commit 789de700815b959ae339f1ae24467c359b962517 Author: ModelHub XC Date: Mon Jun 8 18:47:19 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathfisher_v00.02_s43 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000..e7cc983 --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathfisher_v00.02_s43 +tags: +- generated_from_trainer +- open-r1 +- sft +- trl +licence: license +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathfisher_v00.02_s43 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathfisher_v00.02_s43", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/xud6i1hx) + + + +This model was trained with SFT. + +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..3c49d40 --- /dev/null +++ b/all_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 2.968311309814453e-05, + "total_flos": 5.628159003328302e+19, + "train_loss": 0.4093159021503956, + "train_runtime": 46764.448, + "train_samples": 125770, + "train_samples_per_second": 8.068, + "train_steps_per_second": 0.504 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: +... + + +... +" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} +{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..938828e --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd296778877fc5ea92aefe348d8103343e1f42b615d13084869e1e4e3a0210fa +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..e494ec1 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc571187e1f530b666c5c32f76cd730e5b62015b67a3efa0675f623cac9039d +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..72a7a65 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f43c7f8177bbd6c2505b2326f212b6d572db398056f9036b52d3e9e3baaf4d90 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..6d08c8d --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3cf754e73834043778dfe3d020d62b30a7e98a3325902d3b254e429bff6c62a +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": "<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": "<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": "<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": "<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": "<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..3c49d40 --- /dev/null +++ b/train_results.json @@ -0,0 +1,9 @@ +{ + "ewc_loss": 2.968311309814453e-05, + "total_flos": 5.628159003328302e+19, + "train_loss": 0.4093159021503956, + "train_runtime": 46764.448, + "train_samples": 125770, + "train_samples_per_second": 8.068, + "train_steps_per_second": 0.504 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..fc68c41 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,235874 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "grad_norm": 5.112785816192627, + "learning_rate": 0.0, + "loss": 0.8594, + "mean_token_accuracy": 0.7632832527160645, + "num_tokens": 39170.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "grad_norm": 4.659204959869385, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8038, + "mean_token_accuracy": 0.7740107774734497, + "num_tokens": 79650.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 5.55653613398821e-19, + "grad_norm": 4.923055171966553, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7784, + "mean_token_accuracy": 0.782990574836731, + "num_tokens": 116895.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 2.2985086056692694e-17, + "grad_norm": 4.920858860015869, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8271, + "mean_token_accuracy": 0.7667955160140991, + "num_tokens": 154216.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 6.279698983036042e-16, + "grad_norm": 4.644571781158447, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7281, + "mean_token_accuracy": 0.7933933138847351, + "num_tokens": 192509.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 1.887379141862766e-15, + "grad_norm": 4.795102119445801, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7785, + "mean_token_accuracy": 0.7843474745750427, + "num_tokens": 228902.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 4.274358644806853e-15, + "grad_norm": 4.29025411605835, + "learning_rate": 2.543450614667232e-09, + "loss": 0.783, + "mean_token_accuracy": 0.7800211906433105, + "num_tokens": 272297.0, + "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 1.404432126150823e-14, + "grad_norm": 5.016918659210205, + "learning_rate": 2.967359050445104e-09, + "loss": 0.797, + "mean_token_accuracy": 0.7788540124893188, + "num_tokens": 308129.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 2.2315482794965646e-14, + "grad_norm": 4.699722766876221, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.8089, + "mean_token_accuracy": 0.7727298736572266, + "num_tokens": 348780.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 5.5289106626332796e-14, + "grad_norm": 4.85343074798584, + "learning_rate": 3.815175922000847e-09, + "loss": 0.7983, + "mean_token_accuracy": 0.7793794870376587, + "num_tokens": 385881.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 9.547918011776346e-14, + "grad_norm": 5.362619400024414, + "learning_rate": 4.239084357778719e-09, + "loss": 0.8739, + "mean_token_accuracy": 0.7613564729690552, + "num_tokens": 419746.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 1.2434497875801753e-13, + "grad_norm": 5.138889312744141, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8359, + "mean_token_accuracy": 0.7706336975097656, + "num_tokens": 452147.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 1.5720758028692217e-13, + "grad_norm": 5.1318559646606445, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8535, + "mean_token_accuracy": 0.7633542418479919, + "num_tokens": 490241.0, + "step": 13 + }, + { + "epoch": 0.0017809439002671415, + "ewc_loss": 5.115907697472721e-13, + "grad_norm": 4.569637775421143, + "learning_rate": 5.510809665112336e-09, + "loss": 0.744, + "mean_token_accuracy": 0.792755126953125, + "num_tokens": 531023.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 7.176481631177012e-13, + "grad_norm": 4.587464332580566, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7984, + "mean_token_accuracy": 0.7754523158073425, + "num_tokens": 573152.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 8.490985692333197e-13, + "grad_norm": 4.722011566162109, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8838, + "mean_token_accuracy": 0.7516868710517883, + "num_tokens": 615157.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 1.0018652574217413e-12, + "grad_norm": 5.628353595733643, + "learning_rate": 6.782534972445951e-09, + "loss": 0.8235, + "mean_token_accuracy": 0.7704832553863525, + "num_tokens": 645632.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 1.1013412404281553e-12, + "grad_norm": 4.596994876861572, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8579, + "mean_token_accuracy": 0.7667489051818848, + "num_tokens": 688798.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 4.149569576838985e-12, + "grad_norm": 5.109372138977051, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8153, + "mean_token_accuracy": 0.7732251882553101, + "num_tokens": 722451.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + "ewc_loss": 5.5990767577895895e-12, + "grad_norm": 4.779317855834961, + "learning_rate": 8.054260279779567e-09, + "loss": 0.8642, + "mean_token_accuracy": 0.7603018283843994, + "num_tokens": 761569.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 6.394884621840902e-12, + "grad_norm": 4.522881031036377, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8067, + "mean_token_accuracy": 0.7781086564064026, + "num_tokens": 803477.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 6.764366844436154e-12, + "grad_norm": 4.7256951332092285, + "learning_rate": 8.902077151335311e-09, + "loss": 0.7542, + "mean_token_accuracy": 0.788487434387207, + "num_tokens": 839552.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 7.901235221652314e-12, + "grad_norm": 5.058137893676758, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7801, + "mean_token_accuracy": 0.7803577780723572, + "num_tokens": 874137.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 8.58335624798201e-12, + "grad_norm": 5.046248912811279, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8125, + "mean_token_accuracy": 0.7737522125244141, + "num_tokens": 909950.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 9.379164112033322e-12, + "grad_norm": 4.67251443862915, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.7974, + "mean_token_accuracy": 0.7793619632720947, + "num_tokens": 948927.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 2.955857780762017e-11, + "grad_norm": 4.59185266494751, + "learning_rate": 1.05977108944468e-08, + "loss": 0.7634, + "mean_token_accuracy": 0.7865217924118042, + "num_tokens": 987077.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 3.9108272176235914e-11, + "grad_norm": 5.098145008087158, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.8463, + "mean_token_accuracy": 0.7653138637542725, + "num_tokens": 1023947.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 4.297362465877086e-11, + "grad_norm": 4.744552135467529, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.7798, + "mean_token_accuracy": 0.7845408320426941, + "num_tokens": 1060700.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 4.7066350816749036e-11, + "grad_norm": 4.601330757141113, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.768, + "mean_token_accuracy": 0.7799510955810547, + "num_tokens": 1099464.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 4.956746124662459e-11, + "grad_norm": 4.175507068634033, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.7606, + "mean_token_accuracy": 0.7846396565437317, + "num_tokens": 1146534.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 5.5933924159035087e-11, + "grad_norm": 5.002677917480469, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.8431, + "mean_token_accuracy": 0.7615231871604919, + "num_tokens": 1184539.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 6.139089236967266e-11, + "grad_norm": 4.856429100036621, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8255, + "mean_token_accuracy": 0.7712791562080383, + "num_tokens": 1224478.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 6.411937647499144e-11, + "grad_norm": 4.413727760314941, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.7652, + "mean_token_accuracy": 0.783180296421051, + "num_tokens": 1266135.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 7.003109203651547e-11, + "grad_norm": 4.7614426612854, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.8153, + "mean_token_accuracy": 0.7738863229751587, + "num_tokens": 1306494.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 8.094502845779061e-11, + "grad_norm": 5.030433177947998, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.8274, + "mean_token_accuracy": 0.7715458273887634, + "num_tokens": 1341861.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 2.2100721253082156e-10, + "grad_norm": 5.128272533416748, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.7848, + "mean_token_accuracy": 0.7845706939697266, + "num_tokens": 1373282.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 2.837623469531536e-10, + "grad_norm": 4.993134498596191, + "learning_rate": 1.526070368800339e-08, + "loss": 0.8242, + "mean_token_accuracy": 0.7725497484207153, + "num_tokens": 1410197.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "ewc_loss": 3.183231456205249e-10, + "grad_norm": 4.593441009521484, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7762, + "mean_token_accuracy": 0.7798409461975098, + "num_tokens": 1451151.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 3.4378899727016687e-10, + "grad_norm": 5.400876522064209, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.8076, + "mean_token_accuracy": 0.7722047567367554, + "num_tokens": 1482491.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 3.710738383233547e-10, + "grad_norm": 4.703530311584473, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8121, + "mean_token_accuracy": 0.7772910594940186, + "num_tokens": 1518599.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 3.8198777474462986e-10, + "grad_norm": 4.494019031524658, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8342, + "mean_token_accuracy": 0.771221399307251, + "num_tokens": 1559022.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 3.965396899729967e-10, + "grad_norm": 4.720749378204346, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8229, + "mean_token_accuracy": 0.7739077806472778, + "num_tokens": 1598559.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 4.2746250983327627e-10, + "grad_norm": 4.861265659332275, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.8263, + "mean_token_accuracy": 0.7691702842712402, + "num_tokens": 1638212.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "ewc_loss": 4.656612873077393e-10, + "grad_norm": 4.719359874725342, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.7986, + "mean_token_accuracy": 0.7768405079841614, + "num_tokens": 1679236.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 4.94765117764473e-10, + "grad_norm": 4.862114429473877, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8015, + "mean_token_accuracy": 0.7770185470581055, + "num_tokens": 1717438.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 5.093170329928398e-10, + "grad_norm": 5.124839782714844, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.8209, + "mean_token_accuracy": 0.7696338295936584, + "num_tokens": 1750364.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 5.311449058353901e-10, + "grad_norm": 4.877604007720947, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8704, + "mean_token_accuracy": 0.7667209506034851, + "num_tokens": 1789042.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 5.456968210637569e-10, + "grad_norm": 5.342173099517822, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8533, + "mean_token_accuracy": 0.7621864080429077, + "num_tokens": 1823772.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 7.712515071034431e-10, + "grad_norm": 4.631859302520752, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.8355, + "mean_token_accuracy": 0.7666096091270447, + "num_tokens": 1867281.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "ewc_loss": 1.5643308870494366e-09, + "grad_norm": 4.68604850769043, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.7673, + "mean_token_accuracy": 0.7846611738204956, + "num_tokens": 1906609.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 1.862645149230957e-09, + "grad_norm": 4.366217613220215, + "learning_rate": 2.11954217888936e-08, + "loss": 0.7366, + "mean_token_accuracy": 0.7953136563301086, + "num_tokens": 1946715.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 2.051820047199726e-09, + "grad_norm": 4.623172760009766, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7953, + "mean_token_accuracy": 0.7764700651168823, + "num_tokens": 1985337.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 2.255546860396862e-09, + "grad_norm": 5.085038661956787, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.7787, + "mean_token_accuracy": 0.7801711559295654, + "num_tokens": 2017464.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 2.3865140974521637e-09, + "grad_norm": 4.669393539428711, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.7835, + "mean_token_accuracy": 0.7816396951675415, + "num_tokens": 2056793.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 2.444721758365631e-09, + "grad_norm": 4.684264183044434, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7783284187316895, + "num_tokens": 2093745.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 2.561137080192566e-09, + "grad_norm": 4.7678680419921875, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8187, + "mean_token_accuracy": 0.7751220464706421, + "num_tokens": 2134168.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 2.6338966563344e-09, + "grad_norm": 4.315008640289307, + "learning_rate": 2.373887240356083e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7730646729469299, + "num_tokens": 2178105.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 2.7212081477046013e-09, + "grad_norm": 4.733705997467041, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.8096, + "mean_token_accuracy": 0.7749797701835632, + "num_tokens": 2215312.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 2.764863893389702e-09, + "grad_norm": 4.732920169830322, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.7914, + "mean_token_accuracy": 0.784144937992096, + "num_tokens": 2252517.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 2.8812792152166367e-09, + "grad_norm": 4.845442771911621, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.8167, + "mean_token_accuracy": 0.7739465236663818, + "num_tokens": 2287396.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 3.0850060284137726e-09, + "grad_norm": 5.037092208862305, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.8416, + "mean_token_accuracy": 0.766313910484314, + "num_tokens": 2321104.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 3.2741809263825417e-09, + "grad_norm": 4.89307975769043, + "learning_rate": 2.585841458245019e-08, + "loss": 0.7976, + "mean_token_accuracy": 0.7720996141433716, + "num_tokens": 2358154.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 3.434251993894577e-09, + "grad_norm": 4.740861415863037, + "learning_rate": 2.628232301822806e-08, + "loss": 0.8187, + "mean_token_accuracy": 0.7698023319244385, + "num_tokens": 2394772.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 3.5652192309498787e-09, + "grad_norm": 4.424066066741943, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7172, + "mean_token_accuracy": 0.8008853793144226, + "num_tokens": 2433300.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 3.65253072232008e-09, + "grad_norm": 4.515802383422852, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.8065, + "mean_token_accuracy": 0.7738178968429565, + "num_tokens": 2475070.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 3.725290298461914e-09, + "grad_norm": 4.544943809509277, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7763, + "mean_token_accuracy": 0.7835286855697632, + "num_tokens": 2513688.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 3.754394128918648e-09, + "grad_norm": 4.680739402770996, + "learning_rate": 2.797795676133955e-08, + "loss": 0.7763, + "mean_token_accuracy": 0.7835365533828735, + "num_tokens": 2549423.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 4.0745362639427185e-09, + "grad_norm": 4.672920227050781, + "learning_rate": 2.840186519711742e-08, + "loss": 0.8194, + "mean_token_accuracy": 0.7684056162834167, + "num_tokens": 2587865.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 5.995389074087143e-09, + "grad_norm": 4.662656784057617, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.8055, + "mean_token_accuracy": 0.7730819582939148, + "num_tokens": 2623591.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 9.022187441587448e-09, + "grad_norm": 4.449329376220703, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7336, + "mean_token_accuracy": 0.7902768850326538, + "num_tokens": 2661403.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 1.0710209608078003e-08, + "grad_norm": 4.385138511657715, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.7946, + "mean_token_accuracy": 0.774881899356842, + "num_tokens": 2704442.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 1.1699739843606949e-08, + "grad_norm": 4.709349155426025, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.8748, + "mean_token_accuracy": 0.7535002827644348, + "num_tokens": 2739423.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 1.2514647096395493e-08, + "grad_norm": 4.616166114807129, + "learning_rate": 3.052140737600678e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7799589037895203, + "num_tokens": 2775800.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 1.3154931366443634e-08, + "grad_norm": 4.057197093963623, + "learning_rate": 3.094531581178465e-08, + "loss": 0.7133, + "mean_token_accuracy": 0.7990990877151489, + "num_tokens": 2817394.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 1.3562384992837906e-08, + "grad_norm": 4.607683181762695, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.8608, + "mean_token_accuracy": 0.7589159607887268, + "num_tokens": 2854140.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 1.4028046280145645e-08, + "grad_norm": 4.2435712814331055, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7412, + "mean_token_accuracy": 0.7890951633453369, + "num_tokens": 2894492.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 1.4435499906539917e-08, + "grad_norm": 4.684937000274658, + "learning_rate": 3.221704111911827e-08, + "loss": 0.8079, + "mean_token_accuracy": 0.7759627103805542, + "num_tokens": 2929726.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 1.4842953532934189e-08, + "grad_norm": 4.451656341552734, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8906, + "mean_token_accuracy": 0.7509020566940308, + "num_tokens": 2972039.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 1.501757651567459e-08, + "grad_norm": 4.416698455810547, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7974, + "mean_token_accuracy": 0.7750389575958252, + "num_tokens": 3011924.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 1.5366822481155396e-08, + "grad_norm": 4.450709342956543, + "learning_rate": 3.348876642645188e-08, + "loss": 0.8061, + "mean_token_accuracy": 0.7759023904800415, + "num_tokens": 3049786.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 1.548323780298233e-08, + "grad_norm": 4.223391532897949, + "learning_rate": 3.391267486222975e-08, + "loss": 0.8158, + "mean_token_accuracy": 0.771920919418335, + "num_tokens": 3092655.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 1.594889909029007e-08, + "grad_norm": 4.440260410308838, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8412, + "mean_token_accuracy": 0.7658563256263733, + "num_tokens": 3130378.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 1.6298145055770874e-08, + "grad_norm": 4.0345540046691895, + "learning_rate": 3.47604917337855e-08, + "loss": 0.7793, + "mean_token_accuracy": 0.7787570357322693, + "num_tokens": 3177027.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 1.664739102125168e-08, + "grad_norm": 4.502503395080566, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8363, + "mean_token_accuracy": 0.7643337845802307, + "num_tokens": 3217500.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 1.7229467630386353e-08, + "grad_norm": 4.906854152679443, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.7983, + "mean_token_accuracy": 0.7756432890892029, + "num_tokens": 3250642.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 1.816079020500183e-08, + "grad_norm": 4.252003192901611, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.7855, + "mean_token_accuracy": 0.7844841480255127, + "num_tokens": 3288568.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 1.862645149230957e-08, + "grad_norm": 4.517517566680908, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8065, + "mean_token_accuracy": 0.7735649347305298, + "num_tokens": 3327948.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 1.9208528101444244e-08, + "grad_norm": 4.475348472595215, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.7382, + "mean_token_accuracy": 0.7948620319366455, + "num_tokens": 3365544.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 1.9674189388751984e-08, + "grad_norm": 4.333749294281006, + "learning_rate": 3.730394234845273e-08, + "loss": 0.8226, + "mean_token_accuracy": 0.7676917314529419, + "num_tokens": 3405278.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 2.0139850676059723e-08, + "grad_norm": 4.475931167602539, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8147, + "mean_token_accuracy": 0.7722347378730774, + "num_tokens": 3442356.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 2.0489096641540527e-08, + "grad_norm": 4.0965094566345215, + "learning_rate": 3.815175922000847e-08, + "loss": 0.7858, + "mean_token_accuracy": 0.7788056135177612, + "num_tokens": 3487407.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 2.130400389432907e-08, + "grad_norm": 4.734883785247803, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7793616056442261, + "num_tokens": 3520838.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 2.176966518163681e-08, + "grad_norm": 4.586698532104492, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7609, + "mean_token_accuracy": 0.7875656485557556, + "num_tokens": 3555300.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 2.2351741790771484e-08, + "grad_norm": 5.005249977111816, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8334, + "mean_token_accuracy": 0.7663681507110596, + "num_tokens": 3587307.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 2.3399479687213898e-08, + "grad_norm": 4.518998622894287, + "learning_rate": 3.984739296311997e-08, + "loss": 0.8108, + "mean_token_accuracy": 0.772203803062439, + "num_tokens": 3623953.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 2.6891939342021942e-08, + "grad_norm": 4.5671281814575195, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8028, + "mean_token_accuracy": 0.7768554091453552, + "num_tokens": 3658423.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 3.189779818058014e-08, + "grad_norm": 4.315270900726318, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.7636, + "mean_token_accuracy": 0.7865623235702515, + "num_tokens": 3695704.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 3.91155481338501e-08, + "grad_norm": 4.163238048553467, + "learning_rate": 4.111911827045358e-08, + "loss": 0.7859, + "mean_token_accuracy": 0.7788639664649963, + "num_tokens": 3738881.0, + "step": 98 + }, + { + "epoch": 0.012593817580460501, + "ewc_loss": 4.284083843231201e-08, + "grad_norm": 4.197052478790283, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7915, + "mean_token_accuracy": 0.7793651223182678, + "num_tokens": 3778465.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 4.6100467443466187e-08, + "grad_norm": 4.269277095794678, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.805, + "mean_token_accuracy": 0.7770775556564331, + "num_tokens": 3815904.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 4.7730281949043274e-08, + "grad_norm": 3.9525256156921387, + "learning_rate": 4.23908435777872e-08, + "loss": 0.7309, + "mean_token_accuracy": 0.7940070629119873, + "num_tokens": 3856586.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 4.889443516731262e-08, + "grad_norm": 4.154442310333252, + "learning_rate": 4.281475201356507e-08, + "loss": 0.804, + "mean_token_accuracy": 0.7729145884513855, + "num_tokens": 3894979.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 5.052424967288971e-08, + "grad_norm": 4.18245267868042, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.8091, + "mean_token_accuracy": 0.774681806564331, + "num_tokens": 3934400.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 5.168840289115906e-08, + "grad_norm": 3.8422420024871826, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.7664, + "mean_token_accuracy": 0.7788236737251282, + "num_tokens": 3977142.0, + "step": 104 + }, + { + "epoch": 0.013357079252003561, + "ewc_loss": 5.2852556109428406e-08, + "grad_norm": 3.9386754035949707, + "learning_rate": 4.408647732089869e-08, + "loss": 0.7976, + "mean_token_accuracy": 0.7761133313179016, + "num_tokens": 4017561.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 5.3085386753082275e-08, + "grad_norm": 4.038452625274658, + "learning_rate": 4.451038575667656e-08, + "loss": 0.8088, + "mean_token_accuracy": 0.7759572267532349, + "num_tokens": 4056797.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 5.4016709327697754e-08, + "grad_norm": 4.127188205718994, + "learning_rate": 4.493429419245443e-08, + "loss": 0.7581, + "mean_token_accuracy": 0.7894564270973206, + "num_tokens": 4092487.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 5.448237061500549e-08, + "grad_norm": 4.055181503295898, + "learning_rate": 4.53582026282323e-08, + "loss": 0.7695, + "mean_token_accuracy": 0.7849301099777222, + "num_tokens": 4130772.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 5.587935447692871e-08, + "grad_norm": 4.142572402954102, + "learning_rate": 4.578211106401017e-08, + "loss": 0.8077, + "mean_token_accuracy": 0.7681747674942017, + "num_tokens": 4167818.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 5.681067705154419e-08, + "grad_norm": 4.054182529449463, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.743, + "mean_token_accuracy": 0.7873282432556152, + "num_tokens": 4204814.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + "ewc_loss": 5.727633833885193e-08, + "grad_norm": 4.079557418823242, + "learning_rate": 4.662992793556592e-08, + "loss": 0.7691, + "mean_token_accuracy": 0.774806559085846, + "num_tokens": 4241461.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 5.844049155712128e-08, + "grad_norm": 3.762608289718628, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7684, + "mean_token_accuracy": 0.7810094952583313, + "num_tokens": 4284803.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 5.960464477539063e-08, + "grad_norm": 4.240816593170166, + "learning_rate": 4.747774480712166e-08, + "loss": 0.8111, + "mean_token_accuracy": 0.7698889970779419, + "num_tokens": 4323921.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 6.05359673500061e-08, + "grad_norm": 4.0135650634765625, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7287, + "mean_token_accuracy": 0.7908275127410889, + "num_tokens": 4363137.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 6.146728992462158e-08, + "grad_norm": 4.145638942718506, + "learning_rate": 4.832556167867741e-08, + "loss": 0.7849, + "mean_token_accuracy": 0.7760776877403259, + "num_tokens": 4400586.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 6.332993507385254e-08, + "grad_norm": 4.206759452819824, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.7922, + "mean_token_accuracy": 0.7774192690849304, + "num_tokens": 4436747.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 6.472691893577576e-08, + "grad_norm": 4.080277919769287, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7886, + "mean_token_accuracy": 0.7825331687927246, + "num_tokens": 4473810.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 6.612390279769897e-08, + "grad_norm": 3.850743532180786, + "learning_rate": 4.959728698601102e-08, + "loss": 0.6839, + "mean_token_accuracy": 0.8032190799713135, + "num_tokens": 4510078.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 6.752088665962219e-08, + "grad_norm": 4.15805196762085, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7744, + "mean_token_accuracy": 0.7821847200393677, + "num_tokens": 4545297.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 6.891787052154541e-08, + "grad_norm": 3.959895133972168, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7963, + "mean_token_accuracy": 0.7749349474906921, + "num_tokens": 4584245.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 7.078051567077637e-08, + "grad_norm": 4.133056640625, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.763, + "mean_token_accuracy": 0.7794734239578247, + "num_tokens": 4618901.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 7.264316082000732e-08, + "grad_norm": 4.179129123687744, + "learning_rate": 5.129292072912251e-08, + "loss": 0.8528, + "mean_token_accuracy": 0.7568916082382202, + "num_tokens": 4656399.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 7.450580596923828e-08, + "grad_norm": 3.995520830154419, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7747, + "mean_token_accuracy": 0.7832850217819214, + "num_tokens": 4692918.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 7.543712854385376e-08, + "grad_norm": 4.137028217315674, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.7447, + "mean_token_accuracy": 0.7848802804946899, + "num_tokens": 4728938.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 7.636845111846924e-08, + "grad_norm": 4.492431163787842, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7905, + "mean_token_accuracy": 0.7770572900772095, + "num_tokens": 4767227.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 7.916241884231567e-08, + "grad_norm": 4.259073257446289, + "learning_rate": 5.298855447223399e-08, + "loss": 0.7601, + "mean_token_accuracy": 0.7861456274986267, + "num_tokens": 4800458.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 8.149072527885437e-08, + "grad_norm": 3.8020877838134766, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7694, + "mean_token_accuracy": 0.7790049910545349, + "num_tokens": 4840976.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 8.335337042808533e-08, + "grad_norm": 4.276883125305176, + "learning_rate": 5.383637134378974e-08, + "loss": 0.7916, + "mean_token_accuracy": 0.7760715484619141, + "num_tokens": 4877132.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 8.521601557731628e-08, + "grad_norm": 4.123693466186523, + "learning_rate": 5.426027977956761e-08, + "loss": 0.7619, + "mean_token_accuracy": 0.7850927114486694, + "num_tokens": 4914477.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 9.033828973770142e-08, + "grad_norm": 4.435949802398682, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7145, + "mean_token_accuracy": 0.7937778234481812, + "num_tokens": 4946850.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 9.499490261077881e-08, + "grad_norm": 3.858185291290283, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.7369, + "mean_token_accuracy": 0.7914432287216187, + "num_tokens": 4986922.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 1.0058283805847168e-07, + "grad_norm": 3.8570635318756104, + "learning_rate": 5.553200508690123e-08, + "loss": 0.7665, + "mean_token_accuracy": 0.7807555794715881, + "num_tokens": 5027951.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 1.0756775736808777e-07, + "grad_norm": 3.4609291553497314, + "learning_rate": 5.59559135226791e-08, + "loss": 0.6743, + "mean_token_accuracy": 0.8056797385215759, + "num_tokens": 5069167.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 1.1408701539039612e-07, + "grad_norm": 4.0933427810668945, + "learning_rate": 5.637982195845697e-08, + "loss": 0.7638, + "mean_token_accuracy": 0.7822293639183044, + "num_tokens": 5104247.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 1.1920928955078125e-07, + "grad_norm": 4.360756874084473, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7857, + "mean_token_accuracy": 0.7794708013534546, + "num_tokens": 5138563.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 1.2479722499847412e-07, + "grad_norm": 3.845708131790161, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7752, + "mean_token_accuracy": 0.7757341861724854, + "num_tokens": 5175329.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 1.2945383787155151e-07, + "grad_norm": 3.7758679389953613, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8185, + "mean_token_accuracy": 0.7648602724075317, + "num_tokens": 5215965.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 1.3317912817001343e-07, + "grad_norm": 4.1392107009887695, + "learning_rate": 5.807545570156846e-08, + "loss": 0.7523, + "mean_token_accuracy": 0.7868527173995972, + "num_tokens": 5249119.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 1.3690441846847534e-07, + "grad_norm": 4.064820289611816, + "learning_rate": 5.849936413734633e-08, + "loss": 0.781, + "mean_token_accuracy": 0.7734351754188538, + "num_tokens": 5282719.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 1.434236764907837e-07, + "grad_norm": 3.978546380996704, + "learning_rate": 5.89232725731242e-08, + "loss": 0.7838, + "mean_token_accuracy": 0.7765612602233887, + "num_tokens": 5322764.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 1.471489667892456e-07, + "grad_norm": 4.240859031677246, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7674, + "mean_token_accuracy": 0.7768316268920898, + "num_tokens": 5351964.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 1.4994293451309204e-07, + "grad_norm": 3.396040439605713, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7232, + "mean_token_accuracy": 0.7923865914344788, + "num_tokens": 5397258.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 1.5459954738616943e-07, + "grad_norm": 3.901604413986206, + "learning_rate": 6.019499788045781e-08, + "loss": 0.8409, + "mean_token_accuracy": 0.7577580213546753, + "num_tokens": 5434262.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 1.5925616025924683e-07, + "grad_norm": 3.642657995223999, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7253, + "mean_token_accuracy": 0.7887012362480164, + "num_tokens": 5473274.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 1.6577541828155518e-07, + "grad_norm": 3.3791236877441406, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7368, + "mean_token_accuracy": 0.783409833908081, + "num_tokens": 5510394.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 1.685693860054016e-07, + "grad_norm": 3.745713472366333, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7196, + "mean_token_accuracy": 0.7918102741241455, + "num_tokens": 5546570.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 1.7043203115463257e-07, + "grad_norm": 3.539794683456421, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7477, + "mean_token_accuracy": 0.7830301523208618, + "num_tokens": 5583687.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 1.7136335372924805e-07, + "grad_norm": 3.4681456089019775, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7305, + "mean_token_accuracy": 0.7867828607559204, + "num_tokens": 5619290.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 1.73225998878479e-07, + "grad_norm": 3.626227378845215, + "learning_rate": 6.273844849512505e-08, + "loss": 0.8588, + "mean_token_accuracy": 0.7556899785995483, + "num_tokens": 5655751.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 1.7508864402770996e-07, + "grad_norm": 2.9950482845306396, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7012, + "mean_token_accuracy": 0.7977627515792847, + "num_tokens": 5701498.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 1.7695128917694092e-07, + "grad_norm": 3.6038291454315186, + "learning_rate": 6.35862653666808e-08, + "loss": 0.7767, + "mean_token_accuracy": 0.7783472537994385, + "num_tokens": 5735602.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 1.7881393432617188e-07, + "grad_norm": 2.9418275356292725, + "learning_rate": 6.401017380245867e-08, + "loss": 0.6553, + "mean_token_accuracy": 0.8061176538467407, + "num_tokens": 5777558.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 1.825392246246338e-07, + "grad_norm": 3.5205540657043457, + "learning_rate": 6.443408223823654e-08, + "loss": 0.6924, + "mean_token_accuracy": 0.7977025508880615, + "num_tokens": 5809627.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 1.825392246246338e-07, + "grad_norm": 3.2569923400878906, + "learning_rate": 6.485799067401441e-08, + "loss": 0.7477, + "mean_token_accuracy": 0.7829684019088745, + "num_tokens": 5847346.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 1.8533319234848022e-07, + "grad_norm": 3.756131410598755, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7871, + "mean_token_accuracy": 0.7732771635055542, + "num_tokens": 5880631.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 1.8719583749771118e-07, + "grad_norm": 3.270681381225586, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7481, + "mean_token_accuracy": 0.7849518060684204, + "num_tokens": 5919482.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 1.8998980522155762e-07, + "grad_norm": 3.279874801635742, + "learning_rate": 6.612971598134802e-08, + "loss": 0.7231, + "mean_token_accuracy": 0.7900403738021851, + "num_tokens": 5957126.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 1.9371509552001953e-07, + "grad_norm": 3.150615930557251, + "learning_rate": 6.655362441712589e-08, + "loss": 0.6986, + "mean_token_accuracy": 0.7946871519088745, + "num_tokens": 5994237.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 1.9744038581848145e-07, + "grad_norm": 3.328348159790039, + "learning_rate": 6.697753285290376e-08, + "loss": 0.7235, + "mean_token_accuracy": 0.7859162092208862, + "num_tokens": 6029093.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 1.993030309677124e-07, + "grad_norm": 3.1750965118408203, + "learning_rate": 6.740144128868163e-08, + "loss": 0.6734, + "mean_token_accuracy": 0.8027888536453247, + "num_tokens": 6066185.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 2.0302832126617432e-07, + "grad_norm": 2.9827380180358887, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7254, + "mean_token_accuracy": 0.7900853157043457, + "num_tokens": 6107737.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 2.0675361156463623e-07, + "grad_norm": 3.369025945663452, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7112, + "mean_token_accuracy": 0.7846246957778931, + "num_tokens": 6140277.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 2.0954757928848267e-07, + "grad_norm": 3.1746833324432373, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7488, + "mean_token_accuracy": 0.7815950512886047, + "num_tokens": 6181627.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 2.1141022443771362e-07, + "grad_norm": 3.118027687072754, + "learning_rate": 6.909707503179312e-08, + "loss": 0.7444, + "mean_token_accuracy": 0.7860212326049805, + "num_tokens": 6222406.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 2.1327286958694458e-07, + "grad_norm": 2.964972972869873, + "learning_rate": 6.9520983467571e-08, + "loss": 0.7313, + "mean_token_accuracy": 0.7862666845321655, + "num_tokens": 6261796.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 2.169981598854065e-07, + "grad_norm": 3.5995819568634033, + "learning_rate": 6.994489190334887e-08, + "loss": 0.725, + "mean_token_accuracy": 0.7867592573165894, + "num_tokens": 6296514.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 2.169981598854065e-07, + "grad_norm": 2.908123254776001, + "learning_rate": 7.036880033912674e-08, + "loss": 0.7393, + "mean_token_accuracy": 0.7842776775360107, + "num_tokens": 6340455.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 2.1979212760925293e-07, + "grad_norm": 3.2366552352905273, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7369, + "mean_token_accuracy": 0.7847998142242432, + "num_tokens": 6379707.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 2.2258609533309937e-07, + "grad_norm": 2.9845316410064697, + "learning_rate": 7.121661721068249e-08, + "loss": 0.711, + "mean_token_accuracy": 0.7954764366149902, + "num_tokens": 6420225.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 2.2631138563156128e-07, + "grad_norm": 3.5583887100219727, + "learning_rate": 7.164052564646036e-08, + "loss": 0.6954, + "mean_token_accuracy": 0.7940219044685364, + "num_tokens": 6451401.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 2.300366759300232e-07, + "grad_norm": 3.2359352111816406, + "learning_rate": 7.206443408223823e-08, + "loss": 0.8581, + "mean_token_accuracy": 0.7478452920913696, + "num_tokens": 6493360.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 2.3469328880310059e-07, + "grad_norm": 3.3393661975860596, + "learning_rate": 7.24883425180161e-08, + "loss": 0.7467, + "mean_token_accuracy": 0.7844182848930359, + "num_tokens": 6529103.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 2.3655593395233154e-07, + "grad_norm": 3.553778886795044, + "learning_rate": 7.291225095379398e-08, + "loss": 0.7659, + "mean_token_accuracy": 0.7793879508972168, + "num_tokens": 6565749.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 2.8224244117736816, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7175, + "mean_token_accuracy": 0.7871043682098389, + "num_tokens": 6609991.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 3.4337871074676514, + "learning_rate": 7.376006782534971e-08, + "loss": 0.7133, + "mean_token_accuracy": 0.7897630929946899, + "num_tokens": 6641926.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 3.6122639179229736, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7136, + "mean_token_accuracy": 0.7908457517623901, + "num_tokens": 6675761.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "ewc_loss": 2.384185791015625e-07, + "grad_norm": 3.150521755218506, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7556, + "mean_token_accuracy": 0.7767940759658813, + "num_tokens": 6715012.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 2.421438694000244e-07, + "grad_norm": 3.1132655143737793, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7069, + "mean_token_accuracy": 0.7915602326393127, + "num_tokens": 6754675.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 2.5890767574310303e-07, + "grad_norm": 4.718547821044922, + "learning_rate": 7.54557015684612e-08, + "loss": 0.6463, + "mean_token_accuracy": 0.810376763343811, + "num_tokens": 6793221.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.520482301712036, + "learning_rate": 7.587961000423907e-08, + "loss": 0.7573, + "mean_token_accuracy": 0.7768675088882446, + "num_tokens": 6831051.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.296269178390503, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8015, + "mean_token_accuracy": 0.7609608769416809, + "num_tokens": 6867866.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 2.4400651454925537e-07, + "grad_norm": 3.0263702869415283, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7456, + "mean_token_accuracy": 0.7816359400749207, + "num_tokens": 6908601.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "ewc_loss": 2.4586915969848633e-07, + "grad_norm": 3.0275633335113525, + "learning_rate": 7.715133531157269e-08, + "loss": 0.703, + "mean_token_accuracy": 0.7971014976501465, + "num_tokens": 6942088.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 2.477318048477173e-07, + "grad_norm": 3.396613121032715, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7069, + "mean_token_accuracy": 0.791138768196106, + "num_tokens": 6978999.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 2.514570951461792e-07, + "grad_norm": 3.490004062652588, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7069, + "mean_token_accuracy": 0.7902953028678894, + "num_tokens": 7010909.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 2.5704503059387207e-07, + "grad_norm": 3.0938782691955566, + "learning_rate": 7.842306061890631e-08, + "loss": 0.7254, + "mean_token_accuracy": 0.7876943349838257, + "num_tokens": 7049066.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 2.5890767574310303e-07, + "grad_norm": 4.170904636383057, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7021, + "mean_token_accuracy": 0.7936943173408508, + "num_tokens": 7078335.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 2.6263296604156494e-07, + "grad_norm": 3.7643115520477295, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7237, + "mean_token_accuracy": 0.7891124486923218, + "num_tokens": 7117165.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "ewc_loss": 2.7008354663848877e-07, + "grad_norm": 3.1152427196502686, + "learning_rate": 7.969478592623994e-08, + "loss": 0.6972, + "mean_token_accuracy": 0.7952052354812622, + "num_tokens": 7158409.0, + "step": 189 + }, + { + "epoch": 0.024169952932196922, + "ewc_loss": 2.775341272354126e-07, + "grad_norm": 4.051480293273926, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6784, + "mean_token_accuracy": 0.7978914976119995, + "num_tokens": 7194317.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 3.0919909477233887e-07, + "grad_norm": 4.413509368896484, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7228, + "mean_token_accuracy": 0.7852846384048462, + "num_tokens": 7236014.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 2.8312206268310547e-07, + "grad_norm": 3.597780704498291, + "learning_rate": 8.096651123357356e-08, + "loss": 0.6819, + "mean_token_accuracy": 0.7992464303970337, + "num_tokens": 7272399.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 2.9616057872772217e-07, + "grad_norm": 3.7703816890716553, + "learning_rate": 8.139041966935143e-08, + "loss": 0.69, + "mean_token_accuracy": 0.7945379614830017, + "num_tokens": 7312818.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 3.2181284427642822, + "learning_rate": 8.181432810512929e-08, + "loss": 0.7074, + "mean_token_accuracy": 0.7914304733276367, + "num_tokens": 7349554.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 2.998858690261841e-07, + "grad_norm": 4.26669454574585, + "learning_rate": 8.223823654090716e-08, + "loss": 0.7357, + "mean_token_accuracy": 0.7811651229858398, + "num_tokens": 7386803.0, + "step": 195 + }, + { + "epoch": 0.024933214603739984, + "ewc_loss": 3.0547380447387695e-07, + "grad_norm": 4.110103130340576, + "learning_rate": 8.266214497668503e-08, + "loss": 0.6926, + "mean_token_accuracy": 0.799270749092102, + "num_tokens": 7432539.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 3.0547380447387695e-07, + "grad_norm": 2.767810344696045, + "learning_rate": 8.30860534124629e-08, + "loss": 0.6765, + "mean_token_accuracy": 0.7971323728561401, + "num_tokens": 7470720.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 3.110617399215698e-07, + "grad_norm": 3.727743625640869, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7006, + "mean_token_accuracy": 0.791334331035614, + "num_tokens": 7503988.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 3.1478703022003174e-07, + "grad_norm": 3.9764559268951416, + "learning_rate": 8.393387028401865e-08, + "loss": 0.6542, + "mean_token_accuracy": 0.8032568097114563, + "num_tokens": 7540871.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 3.203749656677246e-07, + "grad_norm": 4.0339789390563965, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7234, + "mean_token_accuracy": 0.7864153981208801, + "num_tokens": 7580269.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 3.454458236694336, + "learning_rate": 8.47816871555744e-08, + "loss": 0.73, + "mean_token_accuracy": 0.7823070883750916, + "num_tokens": 7617089.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 3.296881914138794e-07, + "grad_norm": 3.4636054039001465, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6589, + "mean_token_accuracy": 0.8058700561523438, + "num_tokens": 7656884.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 3.3527612686157227e-07, + "grad_norm": 4.096744060516357, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7026, + "mean_token_accuracy": 0.792354941368103, + "num_tokens": 7696604.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 3.390014171600342e-07, + "grad_norm": 5.6788177490234375, + "learning_rate": 8.605341246290801e-08, + "loss": 0.7061, + "mean_token_accuracy": 0.7880312204360962, + "num_tokens": 7731357.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 3.4086406230926514e-07, + "grad_norm": 3.5532593727111816, + "learning_rate": 8.647732089868589e-08, + "loss": 0.7119, + "mean_token_accuracy": 0.788503110408783, + "num_tokens": 7765972.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 3.4458935260772705e-07, + "grad_norm": 3.4349424839019775, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7138, + "mean_token_accuracy": 0.7947690486907959, + "num_tokens": 7806959.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 3.501772880554199e-07, + "grad_norm": 5.983587741851807, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6848, + "mean_token_accuracy": 0.7994333505630493, + "num_tokens": 7842465.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 3.520399332046509e-07, + "grad_norm": 3.8286898136138916, + "learning_rate": 8.77490462060195e-08, + "loss": 0.7161, + "mean_token_accuracy": 0.7888085842132568, + "num_tokens": 7881583.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 3.520399332046509e-07, + "grad_norm": 2.935136318206787, + "learning_rate": 8.817295464179738e-08, + "loss": 0.64, + "mean_token_accuracy": 0.8089964389801025, + "num_tokens": 7923152.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 3.5390257835388184e-07, + "grad_norm": 3.773440361022949, + "learning_rate": 8.859686307757525e-08, + "loss": 0.7551, + "mean_token_accuracy": 0.7753371000289917, + "num_tokens": 7962466.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 3.594905138015747e-07, + "grad_norm": 4.579355239868164, + "learning_rate": 8.902077151335312e-08, + "loss": 0.714, + "mean_token_accuracy": 0.7880465984344482, + "num_tokens": 7999794.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 3.632158041000366e-07, + "grad_norm": 3.7023065090179443, + "learning_rate": 8.944467994913098e-08, + "loss": 0.6238, + "mean_token_accuracy": 0.8117336630821228, + "num_tokens": 8036568.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 3.632158041000366e-07, + "grad_norm": 3.3383262157440186, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6689, + "mean_token_accuracy": 0.7942816019058228, + "num_tokens": 8073746.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 3.6694109439849854e-07, + "grad_norm": 3.4101076126098633, + "learning_rate": 9.029249682068673e-08, + "loss": 0.6534, + "mean_token_accuracy": 0.8020857572555542, + "num_tokens": 8109788.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 3.6694109439849854e-07, + "grad_norm": 3.2393507957458496, + "learning_rate": 9.07164052564646e-08, + "loss": 0.6362, + "mean_token_accuracy": 0.809645414352417, + "num_tokens": 8144547.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 3.6694109439849854e-07, + "grad_norm": 2.78863525390625, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6415, + "mean_token_accuracy": 0.8111056089401245, + "num_tokens": 8193334.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 3.725290298461914e-07, + "grad_norm": 3.7002475261688232, + "learning_rate": 9.156422212802034e-08, + "loss": 0.71, + "mean_token_accuracy": 0.7862792015075684, + "num_tokens": 8232269.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 3.725290298461914e-07, + "grad_norm": 2.7947487831115723, + "learning_rate": 9.198813056379822e-08, + "loss": 0.7635, + "mean_token_accuracy": 0.7811777591705322, + "num_tokens": 8268124.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 3.725290298461914e-07, + "grad_norm": 3.2472305297851562, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6645, + "mean_token_accuracy": 0.7997159957885742, + "num_tokens": 8305704.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 3.762543201446533e-07, + "grad_norm": 4.4298095703125, + "learning_rate": 9.283594743535396e-08, + "loss": 0.7463, + "mean_token_accuracy": 0.7750442624092102, + "num_tokens": 8344761.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 3.855675458908081e-07, + "grad_norm": 2.750868082046509, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6216, + "mean_token_accuracy": 0.8171108365058899, + "num_tokens": 8383201.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 4.1350722312927246e-07, + "grad_norm": 5.027555465698242, + "learning_rate": 9.368376430690971e-08, + "loss": 0.6747, + "mean_token_accuracy": 0.7974488735198975, + "num_tokens": 8418688.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 3.91155481338501e-07, + "grad_norm": 5.150454998016357, + "learning_rate": 9.410767274268758e-08, + "loss": 0.5966, + "mean_token_accuracy": 0.82199627161026, + "num_tokens": 8455786.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 3.948807716369629e-07, + "grad_norm": 3.886653184890747, + "learning_rate": 9.453158117846545e-08, + "loss": 0.693, + "mean_token_accuracy": 0.7929286956787109, + "num_tokens": 8488007.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 4.0046870708465576e-07, + "grad_norm": 2.9363253116607666, + "learning_rate": 9.495548961424333e-08, + "loss": 0.6578, + "mean_token_accuracy": 0.8026283979415894, + "num_tokens": 8529586.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 4.153698682785034e-07, + "grad_norm": 4.501025199890137, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6834, + "mean_token_accuracy": 0.7940537929534912, + "num_tokens": 8564912.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 4.041939973831177e-07, + "grad_norm": 4.18203067779541, + "learning_rate": 9.580330648579907e-08, + "loss": 0.6325, + "mean_token_accuracy": 0.8085822463035583, + "num_tokens": 8604268.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 4.079192876815796e-07, + "grad_norm": 3.590346574783325, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6714, + "mean_token_accuracy": 0.7957388162612915, + "num_tokens": 8643935.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 4.0978193283081055e-07, + "grad_norm": 4.185186386108398, + "learning_rate": 9.665112335735482e-08, + "loss": 0.7458, + "mean_token_accuracy": 0.7787999510765076, + "num_tokens": 8677348.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 4.0978193283081055e-07, + "grad_norm": 4.015857696533203, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6612, + "mean_token_accuracy": 0.7989256381988525, + "num_tokens": 8718747.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 4.153698682785034e-07, + "grad_norm": 3.5245940685272217, + "learning_rate": 9.749894022891055e-08, + "loss": 0.6501, + "mean_token_accuracy": 0.799597978591919, + "num_tokens": 8756317.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 4.153698682785034e-07, + "grad_norm": 5.048760414123535, + "learning_rate": 9.792284866468842e-08, + "loss": 0.6652, + "mean_token_accuracy": 0.8006918430328369, + "num_tokens": 8798903.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 4.1909515857696533e-07, + "grad_norm": 5.475481986999512, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6648, + "mean_token_accuracy": 0.7984626293182373, + "num_tokens": 8839766.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 4.209578037261963e-07, + "grad_norm": 4.7532267570495605, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6982, + "mean_token_accuracy": 0.7884323596954346, + "num_tokens": 8878542.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 4.246830940246582e-07, + "grad_norm": 3.2430388927459717, + "learning_rate": 9.919457397202204e-08, + "loss": 0.62, + "mean_token_accuracy": 0.8112847805023193, + "num_tokens": 8913426.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 4.246830940246582e-07, + "grad_norm": 3.8210835456848145, + "learning_rate": 9.961848240779991e-08, + "loss": 0.6541, + "mean_token_accuracy": 0.80268394947052, + "num_tokens": 8953593.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 4.2282044887542725e-07, + "grad_norm": 3.724609851837158, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.6845, + "mean_token_accuracy": 0.7965402007102966, + "num_tokens": 8994076.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 4.246830940246582e-07, + "grad_norm": 3.9498941898345947, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8148542046546936, + "num_tokens": 9035109.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 4.246830940246582e-07, + "grad_norm": 4.751842975616455, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6368, + "mean_token_accuracy": 0.8095425367355347, + "num_tokens": 9073690.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 4.2654573917388916e-07, + "grad_norm": 3.58168363571167, + "learning_rate": 1.013141161509114e-07, + "loss": 0.5998, + "mean_token_accuracy": 0.8179998397827148, + "num_tokens": 9114416.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 4.302710294723511e-07, + "grad_norm": 3.755465507507324, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.5912, + "mean_token_accuracy": 0.8211683630943298, + "num_tokens": 9150254.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 4.33996319770813e-07, + "grad_norm": 6.69765567779541, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.6594, + "mean_token_accuracy": 0.7995933294296265, + "num_tokens": 9186122.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 4.3213367462158203e-07, + "grad_norm": 4.130110263824463, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.7202, + "mean_token_accuracy": 0.7877663969993591, + "num_tokens": 9224080.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 4.3585896492004395e-07, + "grad_norm": 3.201904058456421, + "learning_rate": 1.0300974989402289e-07, + "loss": 0.644, + "mean_token_accuracy": 0.8095653057098389, + "num_tokens": 9268908.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 4.377216100692749e-07, + "grad_norm": 5.278276443481445, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6269, + "mean_token_accuracy": 0.8098541498184204, + "num_tokens": 9307461.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 4.377216100692749e-07, + "grad_norm": 5.025210857391357, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6991, + "mean_token_accuracy": 0.7877054214477539, + "num_tokens": 9347875.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 4.414469003677368e-07, + "grad_norm": 4.633707523345947, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6871, + "mean_token_accuracy": 0.7915256023406982, + "num_tokens": 9386288.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 4.4517219066619873e-07, + "grad_norm": 3.5541863441467285, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.647, + "mean_token_accuracy": 0.8037606477737427, + "num_tokens": 9420281.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 4.4517219066619873e-07, + "grad_norm": 3.3455753326416016, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6308, + "mean_token_accuracy": 0.8075579404830933, + "num_tokens": 9460932.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 4.4889748096466064e-07, + "grad_norm": 4.443286418914795, + "learning_rate": 1.0555320050869011e-07, + "loss": 0.6073, + "mean_token_accuracy": 0.8167638182640076, + "num_tokens": 9499415.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 4.4889748096466064e-07, + "grad_norm": 3.9255945682525635, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.6457, + "mean_token_accuracy": 0.8036038875579834, + "num_tokens": 9538314.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 4.5262277126312256e-07, + "grad_norm": 4.896951675415039, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.7259, + "mean_token_accuracy": 0.7818572521209717, + "num_tokens": 9573643.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 4.5821070671081543e-07, + "grad_norm": 4.45403528213501, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6724, + "mean_token_accuracy": 0.8015353083610535, + "num_tokens": 9618799.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 4.5634806156158447e-07, + "grad_norm": 6.000646114349365, + "learning_rate": 1.072488342518016e-07, + "loss": 0.6247, + "mean_token_accuracy": 0.8115634918212891, + "num_tokens": 9658460.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 4.5821070671081543e-07, + "grad_norm": 6.186342239379883, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.7212, + "mean_token_accuracy": 0.784306526184082, + "num_tokens": 9698015.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 4.600733518600464e-07, + "grad_norm": 3.6952991485595703, + "learning_rate": 1.0809665112335735e-07, + "loss": 0.6516, + "mean_token_accuracy": 0.8056342005729675, + "num_tokens": 9736284.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 4.6193599700927734e-07, + "grad_norm": 5.205880165100098, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.7072, + "mean_token_accuracy": 0.7888765931129456, + "num_tokens": 9780131.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 4.6566128730773926e-07, + "grad_norm": 4.747349262237549, + "learning_rate": 1.089444679949131e-07, + "loss": 0.6911, + "mean_token_accuracy": 0.7897598743438721, + "num_tokens": 9819420.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 4.6938657760620117e-07, + "grad_norm": 3.713319778442383, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6148, + "mean_token_accuracy": 0.8119335174560547, + "num_tokens": 9858002.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 4.76837158203125e-07, + "grad_norm": 4.3074822425842285, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.7056, + "mean_token_accuracy": 0.7869214415550232, + "num_tokens": 9894120.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 4.731118679046631e-07, + "grad_norm": 4.960076808929443, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7701, + "mean_token_accuracy": 0.7669531106948853, + "num_tokens": 9924849.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 6.7141194343566895, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6334, + "mean_token_accuracy": 0.806977391242981, + "num_tokens": 9960751.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 4.76837158203125e-07, + "grad_norm": 5.776894569396973, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.656, + "mean_token_accuracy": 0.8017259240150452, + "num_tokens": 9997988.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 4.805624485015869e-07, + "grad_norm": 4.394406318664551, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.6035, + "mean_token_accuracy": 0.8165450692176819, + "num_tokens": 10029286.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.228062629699707, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6283, + "mean_token_accuracy": 0.8092474937438965, + "num_tokens": 10068216.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.4883623123168945, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8330689072608948, + "num_tokens": 10106729.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 6.274479389190674, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6883, + "mean_token_accuracy": 0.7937415242195129, + "num_tokens": 10144554.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 5.916316509246826, + "learning_rate": 1.131835523526918e-07, + "loss": 0.65, + "mean_token_accuracy": 0.8004387617111206, + "num_tokens": 10181300.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.301060676574707, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.6186, + "mean_token_accuracy": 0.8150109052658081, + "num_tokens": 10218210.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 3.8964591026306152, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.7022, + "mean_token_accuracy": 0.7844923734664917, + "num_tokens": 10254876.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.789085865020752, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6642, + "mean_token_accuracy": 0.80047607421875, + "num_tokens": 10296679.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 5.821712017059326, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7057, + "mean_token_accuracy": 0.7877573370933533, + "num_tokens": 10333286.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 8.158778190612793, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.7004, + "mean_token_accuracy": 0.7863149046897888, + "num_tokens": 10362425.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.801184177398682, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6949, + "mean_token_accuracy": 0.7897822260856628, + "num_tokens": 10401629.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 5.301501750946045, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.6608, + "mean_token_accuracy": 0.8033064603805542, + "num_tokens": 10439333.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 3.4595727920532227, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.6288, + "mean_token_accuracy": 0.8114580512046814, + "num_tokens": 10476170.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 4.842877388000488e-07, + "grad_norm": 4.661934852600098, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6451, + "mean_token_accuracy": 0.80165034532547, + "num_tokens": 10513298.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 4.880130290985107e-07, + "grad_norm": 4.889813423156738, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6741, + "mean_token_accuracy": 0.7968564033508301, + "num_tokens": 10551149.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 4.880130290985107e-07, + "grad_norm": 5.1986799240112305, + "learning_rate": 1.178465451462484e-07, + "loss": 0.6154, + "mean_token_accuracy": 0.8122014999389648, + "num_tokens": 10590034.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 4.880130290985107e-07, + "grad_norm": 4.504305362701416, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8217391967773438, + "num_tokens": 10626670.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 4.880130290985107e-07, + "grad_norm": 4.278409004211426, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.6449, + "mean_token_accuracy": 0.7995141744613647, + "num_tokens": 10661319.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 4.954636096954346e-07, + "grad_norm": 4.601391315460205, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.6711, + "mean_token_accuracy": 0.805463433265686, + "num_tokens": 10700705.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 6.123531818389893, + "learning_rate": 1.195421788893599e-07, + "loss": 0.6507, + "mean_token_accuracy": 0.80202317237854, + "num_tokens": 10734985.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 4.991888999938965e-07, + "grad_norm": 5.734160900115967, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6295, + "mean_token_accuracy": 0.8087038993835449, + "num_tokens": 10775868.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 4.991888999938965e-07, + "grad_norm": 4.619617462158203, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.602, + "mean_token_accuracy": 0.8166011571884155, + "num_tokens": 10815638.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "ewc_loss": 4.991888999938965e-07, + "grad_norm": 5.2873759269714355, + "learning_rate": 1.208139041966935e-07, + "loss": 0.6239, + "mean_token_accuracy": 0.8109452128410339, + "num_tokens": 10853402.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 5.029141902923584e-07, + "grad_norm": 4.89241886138916, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.6553, + "mean_token_accuracy": 0.8012163639068604, + "num_tokens": 10887031.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 5.066394805908203e-07, + "grad_norm": 5.7362823486328125, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.6474, + "mean_token_accuracy": 0.8021619319915771, + "num_tokens": 10924568.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 5.1921186447143555, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6889, + "mean_token_accuracy": 0.7924560308456421, + "num_tokens": 10959794.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 5.871106147766113, + "learning_rate": 1.22509537939805e-07, + "loss": 0.7104, + "mean_token_accuracy": 0.785965621471405, + "num_tokens": 10998746.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 3.845465898513794, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8117653131484985, + "num_tokens": 11038232.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 3.8155453205108643, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6457, + "mean_token_accuracy": 0.8048034906387329, + "num_tokens": 11075448.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 4.373762130737305, + "learning_rate": 1.237812632471386e-07, + "loss": 0.644, + "mean_token_accuracy": 0.8075552582740784, + "num_tokens": 11118255.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 5.103647708892822e-07, + "grad_norm": 3.5635364055633545, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.598, + "mean_token_accuracy": 0.8169007897377014, + "num_tokens": 11157661.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 5.140900611877441e-07, + "grad_norm": 3.7978198528289795, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6533, + "mean_token_accuracy": 0.8001406192779541, + "num_tokens": 11196868.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 5.21540641784668e-07, + "grad_norm": 5.0593485832214355, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.583, + "mean_token_accuracy": 0.8206794261932373, + "num_tokens": 11235143.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 5.178153514862061e-07, + "grad_norm": 3.7522120475769043, + "learning_rate": 1.254768969902501e-07, + "loss": 0.6697, + "mean_token_accuracy": 0.7941621541976929, + "num_tokens": 11280292.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 4.4123992919921875, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.62, + "mean_token_accuracy": 0.813015341758728, + "num_tokens": 11323359.0, + "step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 5.289912223815918e-07, + "grad_norm": 4.315871715545654, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6195, + "mean_token_accuracy": 0.811742901802063, + "num_tokens": 11362142.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 5.401670932769775e-07, + "grad_norm": 5.513349533081055, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6326, + "mean_token_accuracy": 0.8047816753387451, + "num_tokens": 11395754.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 3.785338878631592, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6152, + "mean_token_accuracy": 0.8145062923431396, + "num_tokens": 11433947.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 5.438923835754395e-07, + "grad_norm": 5.424266338348389, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6756, + "mean_token_accuracy": 0.790596604347229, + "num_tokens": 11471300.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 5.476176738739014e-07, + "grad_norm": 4.4764299392700195, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.6221, + "mean_token_accuracy": 0.8089026212692261, + "num_tokens": 11509259.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "ewc_loss": 5.513429641723633e-07, + "grad_norm": 4.743232250213623, + "learning_rate": 1.284442560406952e-07, + "loss": 0.6362, + "mean_token_accuracy": 0.80567467212677, + "num_tokens": 11542499.0, + "step": 304 + }, + { + "epoch": 0.038799134970105585, + "ewc_loss": 5.550682544708252e-07, + "grad_norm": 5.335220813751221, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.655, + "mean_token_accuracy": 0.7996744513511658, + "num_tokens": 11583260.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 5.999854564666748, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6515, + "mean_token_accuracy": 0.7991554737091064, + "num_tokens": 11617778.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 5.142443656921387, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6911, + "mean_token_accuracy": 0.7894093990325928, + "num_tokens": 11650314.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 5.587935447692871e-07, + "grad_norm": 5.1604905128479, + "learning_rate": 1.301398897838067e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8244826793670654, + "num_tokens": 11687352.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 5.662441253662109e-07, + "grad_norm": 4.886850357055664, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6516, + "mean_token_accuracy": 0.8000390529632568, + "num_tokens": 11724799.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "ewc_loss": 5.662441253662109e-07, + "grad_norm": 3.971693515777588, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.648, + "mean_token_accuracy": 0.8038218021392822, + "num_tokens": 11769211.0, + "step": 310 + }, + { + "epoch": 0.039562396641648644, + "ewc_loss": 5.736947059631348e-07, + "grad_norm": 5.643726825714111, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6758, + "mean_token_accuracy": 0.7904808521270752, + "num_tokens": 11801881.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.313868284225464, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6155, + "mean_token_accuracy": 0.8116620182991028, + "num_tokens": 11838640.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 4.068597316741943, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6975, + "mean_token_accuracy": 0.7869770526885986, + "num_tokens": 11874443.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 3.6097469329833984, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6022, + "mean_token_accuracy": 0.8159560561180115, + "num_tokens": 11915852.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 5.774199962615967e-07, + "grad_norm": 4.074393272399902, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.6026, + "mean_token_accuracy": 0.8151445388793945, + "num_tokens": 11952224.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "ewc_loss": 5.811452865600586e-07, + "grad_norm": 5.027655601501465, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6582, + "mean_token_accuracy": 0.798538327217102, + "num_tokens": 11984589.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, + "ewc_loss": 5.848705768585205e-07, + "grad_norm": 3.88228440284729, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.5763, + "mean_token_accuracy": 0.818403959274292, + "num_tokens": 12019568.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 5.885958671569824e-07, + "grad_norm": 4.661266326904297, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.6287, + "mean_token_accuracy": 0.8058995604515076, + "num_tokens": 12050095.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 5.885958671569824e-07, + "grad_norm": 3.7926199436187744, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6621, + "mean_token_accuracy": 0.8002058863639832, + "num_tokens": 12086243.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 3.869443655014038, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8323455452919006, + "num_tokens": 12121597.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 5.463377475738525, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6266, + "mean_token_accuracy": 0.8076192140579224, + "num_tokens": 12154714.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 3.851505756378174, + "learning_rate": 1.360746078846969e-07, + "loss": 0.604, + "mean_token_accuracy": 0.8118230700492859, + "num_tokens": 12189725.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 5.923211574554443e-07, + "grad_norm": 4.167165756225586, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6364, + "mean_token_accuracy": 0.8044705390930176, + "num_tokens": 12229328.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 5.960464477539062e-07, + "grad_norm": 3.6854515075683594, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.659, + "mean_token_accuracy": 0.8037538528442383, + "num_tokens": 12267825.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 5.997717380523682e-07, + "grad_norm": 3.9770143032073975, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6432, + "mean_token_accuracy": 0.8048826456069946, + "num_tokens": 12305285.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 5.960464477539062e-07, + "grad_norm": 3.871513605117798, + "learning_rate": 1.377702416278084e-07, + "loss": 0.6415, + "mean_token_accuracy": 0.8043287992477417, + "num_tokens": 12342919.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 5.960464477539062e-07, + "grad_norm": 3.6065962314605713, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8256920576095581, + "num_tokens": 12380318.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 5.997717380523682e-07, + "grad_norm": 4.4641618728637695, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.6574, + "mean_token_accuracy": 0.7979960441589355, + "num_tokens": 12418936.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 5.960464477539062e-07, + "grad_norm": 3.8009705543518066, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6902, + "mean_token_accuracy": 0.7874214053153992, + "num_tokens": 12459537.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 3.1286673545837402, + "learning_rate": 1.394658753709199e-07, + "loss": 0.645, + "mean_token_accuracy": 0.8005847334861755, + "num_tokens": 12497283.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 6.07222318649292e-07, + "grad_norm": 3.55623722076416, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.599, + "mean_token_accuracy": 0.814693033695221, + "num_tokens": 12536782.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 6.034970283508301e-07, + "grad_norm": 3.8292839527130127, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6352, + "mean_token_accuracy": 0.8029751777648926, + "num_tokens": 12574283.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 6.109476089477539e-07, + "grad_norm": 4.350155353546143, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6825, + "mean_token_accuracy": 0.7906941175460815, + "num_tokens": 12615368.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 6.221234798431396e-07, + "grad_norm": 4.505923271179199, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8034814596176147, + "num_tokens": 12657344.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 6.258487701416016e-07, + "grad_norm": 3.2546987533569336, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.6155, + "mean_token_accuracy": 0.8073910474777222, + "num_tokens": 12693142.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 3.751112222671509, + "learning_rate": 1.420093259855871e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8234869837760925, + "num_tokens": 12735102.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 6.332993507385254e-07, + "grad_norm": 4.185041427612305, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.6171, + "mean_token_accuracy": 0.8115221261978149, + "num_tokens": 12779017.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 6.370246410369873e-07, + "grad_norm": 3.589888095855713, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6589, + "mean_token_accuracy": 0.7998425960540771, + "num_tokens": 12816068.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 6.370246410369873e-07, + "grad_norm": 3.7698514461517334, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.5967, + "mean_token_accuracy": 0.8170826435089111, + "num_tokens": 12850937.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 6.407499313354492e-07, + "grad_norm": 4.208564281463623, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8144179582595825, + "num_tokens": 12887067.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 4.640837669372559, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.682, + "mean_token_accuracy": 0.792753279209137, + "num_tokens": 12922944.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 6.444752216339111e-07, + "grad_norm": 3.856052875518799, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8309961557388306, + "num_tokens": 12954992.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 6.556510925292969e-07, + "grad_norm": 3.1530508995056152, + "learning_rate": 1.449766850360322e-07, + "loss": 0.5957, + "mean_token_accuracy": 0.8173094391822815, + "num_tokens": 13002183.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 6.48200511932373e-07, + "grad_norm": 4.880069255828857, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.5925, + "mean_token_accuracy": 0.8176546096801758, + "num_tokens": 13039160.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 6.631016731262207e-07, + "grad_norm": 4.166593074798584, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6077, + "mean_token_accuracy": 0.8115376234054565, + "num_tokens": 13079139.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 6.631016731262207e-07, + "grad_norm": 3.756916046142578, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6369, + "mean_token_accuracy": 0.8003906011581421, + "num_tokens": 13114351.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 6.668269634246826e-07, + "grad_norm": 5.056085109710693, + "learning_rate": 1.466723187791437e-07, + "loss": 0.6141, + "mean_token_accuracy": 0.8108057975769043, + "num_tokens": 13149700.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 6.705522537231445e-07, + "grad_norm": 3.737074613571167, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.6624, + "mean_token_accuracy": 0.8007044792175293, + "num_tokens": 13190741.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 3.804999351501465, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8257667422294617, + "num_tokens": 13235447.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 3.7087574005126953, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.6169, + "mean_token_accuracy": 0.8089456558227539, + "num_tokens": 13279256.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 6.780028343200684e-07, + "grad_norm": 4.622411727905273, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.6366, + "mean_token_accuracy": 0.8002209663391113, + "num_tokens": 13314796.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 6.854534149169922e-07, + "grad_norm": 3.781181573867798, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8259122371673584, + "num_tokens": 13355479.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 6.891787052154541e-07, + "grad_norm": 3.4625473022460938, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6359, + "mean_token_accuracy": 0.8085002899169922, + "num_tokens": 13396185.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 6.891787052154541e-07, + "grad_norm": 3.4893198013305664, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5961, + "mean_token_accuracy": 0.813057541847229, + "num_tokens": 13445219.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 6.891787052154541e-07, + "grad_norm": 3.887721538543701, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.5913, + "mean_token_accuracy": 0.81715989112854, + "num_tokens": 13483537.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 6.966292858123779e-07, + "grad_norm": 3.868173360824585, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.5889, + "mean_token_accuracy": 0.8130265474319458, + "num_tokens": 13517969.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 6.92903995513916e-07, + "grad_norm": 4.5516581535339355, + "learning_rate": 1.509114031369224e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8245593905448914, + "num_tokens": 13553082.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 6.966292858123779e-07, + "grad_norm": 3.9642114639282227, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6504, + "mean_token_accuracy": 0.801701545715332, + "num_tokens": 13593189.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 7.003545761108398e-07, + "grad_norm": 3.918199300765991, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.6478, + "mean_token_accuracy": 0.8041318655014038, + "num_tokens": 13628318.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 7.040798664093018e-07, + "grad_norm": 4.135272026062012, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8082947134971619, + "num_tokens": 13662621.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 7.115304470062256e-07, + "grad_norm": 3.936858654022217, + "learning_rate": 1.526070368800339e-07, + "loss": 0.6026, + "mean_token_accuracy": 0.8115317225456238, + "num_tokens": 13700329.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 7.152557373046875e-07, + "grad_norm": 4.588818073272705, + "learning_rate": 1.530309453158118e-07, + "loss": 0.6395, + "mean_token_accuracy": 0.8013383150100708, + "num_tokens": 13738916.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 7.189810276031494e-07, + "grad_norm": 3.610457181930542, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.8015140295028687, + "num_tokens": 13776971.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 7.189810276031494e-07, + "grad_norm": 4.02097749710083, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.5942, + "mean_token_accuracy": 0.8162314891815186, + "num_tokens": 13815144.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 7.264316082000732e-07, + "grad_norm": 3.3971314430236816, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.8209737539291382, + "num_tokens": 13855000.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 7.264316082000732e-07, + "grad_norm": 4.506768226623535, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.6833, + "mean_token_accuracy": 0.7961810231208801, + "num_tokens": 13892829.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 7.264316082000732e-07, + "grad_norm": 3.118638038635254, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8243770599365234, + "num_tokens": 13936045.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 7.227063179016113e-07, + "grad_norm": 3.9190316200256348, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6131, + "mean_token_accuracy": 0.8104079961776733, + "num_tokens": 13977259.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 7.227063179016113e-07, + "grad_norm": 4.160822868347168, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.832868218421936, + "num_tokens": 14008670.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 7.338821887969971e-07, + "grad_norm": 4.0409698486328125, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.569, + "mean_token_accuracy": 0.8248876929283142, + "num_tokens": 14041634.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 7.338821887969971e-07, + "grad_norm": 4.264252662658691, + "learning_rate": 1.5684612123781262e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8204932808876038, + "num_tokens": 14076944.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 7.37607479095459e-07, + "grad_norm": 3.8847386837005615, + "learning_rate": 1.572700296735905e-07, + "loss": 0.64, + "mean_token_accuracy": 0.8023068308830261, + "num_tokens": 14109665.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 7.450580596923828e-07, + "grad_norm": 3.1539766788482666, + "learning_rate": 1.576939381093684e-07, + "loss": 0.5765, + "mean_token_accuracy": 0.8195260763168335, + "num_tokens": 14154642.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 7.413327693939209e-07, + "grad_norm": 3.517975330352783, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.6361, + "mean_token_accuracy": 0.8072751760482788, + "num_tokens": 14194043.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 7.562339305877686e-07, + "grad_norm": 3.905153274536133, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.6052, + "mean_token_accuracy": 0.8117572665214539, + "num_tokens": 14230821.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 4.152787208557129, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.6211, + "mean_token_accuracy": 0.8070400357246399, + "num_tokens": 14261791.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 7.636845111846924e-07, + "grad_norm": 4.53727388381958, + "learning_rate": 1.5938957185247988e-07, + "loss": 0.6357, + "mean_token_accuracy": 0.8053901791572571, + "num_tokens": 14295778.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 3.2112648487091064, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.6203, + "mean_token_accuracy": 0.8030776381492615, + "num_tokens": 14335381.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 3.6239185333251953, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.6012, + "mean_token_accuracy": 0.810909628868103, + "num_tokens": 14368939.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 3.574575901031494, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5929, + "mean_token_accuracy": 0.81232750415802, + "num_tokens": 14401725.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 3.63779878616333, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8143301010131836, + "num_tokens": 14446175.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 7.674098014831543e-07, + "grad_norm": 4.910614013671875, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.612, + "mean_token_accuracy": 0.8095716238021851, + "num_tokens": 14481681.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 4.860683917999268, + "learning_rate": 1.619330224671471e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8200065493583679, + "num_tokens": 14518201.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 7.748603820800781e-07, + "grad_norm": 3.4087984561920166, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8184984922409058, + "num_tokens": 14558445.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 4.4547343254089355, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.5946, + "mean_token_accuracy": 0.8139301538467407, + "num_tokens": 14593434.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 7.82310962677002e-07, + "grad_norm": 5.569748878479004, + "learning_rate": 1.632047477744807e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8324764966964722, + "num_tokens": 14629327.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 7.897615432739258e-07, + "grad_norm": 5.994439601898193, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6063, + "mean_token_accuracy": 0.8181325793266296, + "num_tokens": 14670233.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 7.934868335723877e-07, + "grad_norm": 5.61524772644043, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.829524576663971, + "num_tokens": 14704517.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 7.972121238708496e-07, + "grad_norm": 2.9737493991851807, + "learning_rate": 1.6447647308181432e-07, + "loss": 0.5941, + "mean_token_accuracy": 0.816117525100708, + "num_tokens": 14745061.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 7.972121238708496e-07, + "grad_norm": 3.4277989864349365, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6053, + "mean_token_accuracy": 0.8142637610435486, + "num_tokens": 14787453.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 8.009374141693115e-07, + "grad_norm": 3.5471017360687256, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5871, + "mean_token_accuracy": 0.8192212581634521, + "num_tokens": 14832212.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 8.009374141693115e-07, + "grad_norm": 3.8106486797332764, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5707, + "mean_token_accuracy": 0.824248194694519, + "num_tokens": 14872469.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 3.677051067352295, + "learning_rate": 1.661721068249258e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8186178207397461, + "num_tokens": 14907358.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 3.73248553276062, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6049, + "mean_token_accuracy": 0.8118725419044495, + "num_tokens": 14941759.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 8.121132850646973e-07, + "grad_norm": 3.2619831562042236, + "learning_rate": 1.6701992369648156e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.821471095085144, + "num_tokens": 14979802.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 3.3716864585876465, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.6267, + "mean_token_accuracy": 0.8102551698684692, + "num_tokens": 15018471.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 8.083879947662354e-07, + "grad_norm": 3.556145429611206, + "learning_rate": 1.678677405680373e-07, + "loss": 0.6197, + "mean_token_accuracy": 0.8067690134048462, + "num_tokens": 15055124.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 8.158385753631592e-07, + "grad_norm": 3.2086918354034424, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8277912139892578, + "num_tokens": 15095774.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 3.72983980178833, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.6284, + "mean_token_accuracy": 0.8061993718147278, + "num_tokens": 15135526.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 8.23289155960083e-07, + "grad_norm": 3.1607372760772705, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.5896, + "mean_token_accuracy": 0.8171243667602539, + "num_tokens": 15173234.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 8.270144462585449e-07, + "grad_norm": 3.4530537128448486, + "learning_rate": 1.695633743111488e-07, + "loss": 0.6227, + "mean_token_accuracy": 0.8063904643058777, + "num_tokens": 15212106.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 8.344650268554688e-07, + "grad_norm": 2.879967451095581, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.6259, + "mean_token_accuracy": 0.8091877102851868, + "num_tokens": 15251089.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 8.307397365570068e-07, + "grad_norm": 3.164386034011841, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8222301006317139, + "num_tokens": 15285022.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 8.381903171539307e-07, + "grad_norm": 2.875093460083008, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.5999, + "mean_token_accuracy": 0.822534441947937, + "num_tokens": 15325890.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 2.963108539581299, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.8054509162902832, + "num_tokens": 15365035.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 8.381903171539307e-07, + "grad_norm": 3.6241214275360107, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.6346, + "mean_token_accuracy": 0.8058156371116638, + "num_tokens": 15402494.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 8.344650268554688e-07, + "grad_norm": 3.3952062129974365, + "learning_rate": 1.7210682492581603e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8409150242805481, + "num_tokens": 15439344.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 8.419156074523926e-07, + "grad_norm": 3.4002151489257812, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.6036, + "mean_token_accuracy": 0.8106805086135864, + "num_tokens": 15476601.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 8.456408977508545e-07, + "grad_norm": 3.4106955528259277, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.5469, + "mean_token_accuracy": 0.8284087181091309, + "num_tokens": 15511266.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 8.530914783477783e-07, + "grad_norm": 2.8412132263183594, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8262863755226135, + "num_tokens": 15550499.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 8.530914783477783e-07, + "grad_norm": 3.103003740310669, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8281168937683105, + "num_tokens": 15581563.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 8.568167686462402e-07, + "grad_norm": 3.84724760055542, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.5633, + "mean_token_accuracy": 0.8259454965591431, + "num_tokens": 15615210.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 8.605420589447021e-07, + "grad_norm": 3.5558009147644043, + "learning_rate": 1.7465027554048326e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8273679614067078, + "num_tokens": 15654592.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 8.605420589447021e-07, + "grad_norm": 3.1038076877593994, + "learning_rate": 1.750741839762611e-07, + "loss": 0.5244, + "mean_token_accuracy": 0.8326007127761841, + "num_tokens": 15691853.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 8.605420589447021e-07, + "grad_norm": 3.8972811698913574, + "learning_rate": 1.75498092412039e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8240953683853149, + "num_tokens": 15729559.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 8.67992639541626e-07, + "grad_norm": 4.092374801635742, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.6222, + "mean_token_accuracy": 0.8033758997917175, + "num_tokens": 15759121.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 8.67992639541626e-07, + "grad_norm": 3.000399112701416, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8276574611663818, + "num_tokens": 15801636.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 8.791685104370117e-07, + "grad_norm": 3.8450024127960205, + "learning_rate": 1.767698177193726e-07, + "loss": 0.5833, + "mean_token_accuracy": 0.8209682703018188, + "num_tokens": 15837444.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 8.754432201385498e-07, + "grad_norm": 3.034790277481079, + "learning_rate": 1.771937261551505e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8328878879547119, + "num_tokens": 15879376.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 8.754432201385498e-07, + "grad_norm": 3.5520029067993164, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.6756, + "mean_token_accuracy": 0.7923411130905151, + "num_tokens": 15922382.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 8.791685104370117e-07, + "grad_norm": 3.2718260288238525, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.6044, + "mean_token_accuracy": 0.8138763308525085, + "num_tokens": 15958592.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 8.828938007354736e-07, + "grad_norm": 2.867429256439209, + "learning_rate": 1.784654514624841e-07, + "loss": 0.529, + "mean_token_accuracy": 0.83222496509552, + "num_tokens": 15993186.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 8.866190910339355e-07, + "grad_norm": 3.0067977905273438, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.6078, + "mean_token_accuracy": 0.8118821978569031, + "num_tokens": 16029210.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 8.903443813323975e-07, + "grad_norm": 3.343575954437256, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.5887, + "mean_token_accuracy": 0.8159116506576538, + "num_tokens": 16068031.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 8.903443813323975e-07, + "grad_norm": 2.6710946559906006, + "learning_rate": 1.797371767698177e-07, + "loss": 0.6055, + "mean_token_accuracy": 0.8125061988830566, + "num_tokens": 16109232.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 8.940696716308594e-07, + "grad_norm": 2.920994997024536, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.537, + "mean_token_accuracy": 0.83107590675354, + "num_tokens": 16154189.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 9.015202522277832e-07, + "grad_norm": 2.6712429523468018, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8204241991043091, + "num_tokens": 16196717.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 9.052455425262451e-07, + "grad_norm": 2.970787525177002, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.5586, + "mean_token_accuracy": 0.8248772621154785, + "num_tokens": 16238401.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 9.08970832824707e-07, + "grad_norm": 2.9037489891052246, + "learning_rate": 1.814328105129292e-07, + "loss": 0.6262, + "mean_token_accuracy": 0.8057089447975159, + "num_tokens": 16283766.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 9.164214134216309e-07, + "grad_norm": 3.6885123252868652, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.6368, + "mean_token_accuracy": 0.8048459887504578, + "num_tokens": 16325655.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 3.1714189052581787, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.8219386339187622, + "num_tokens": 16366306.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 3.1967597007751465, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8206506967544556, + "num_tokens": 16400964.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 9.238719940185547e-07, + "grad_norm": 2.845585584640503, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8230022192001343, + "num_tokens": 16440022.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 9.275972843170166e-07, + "grad_norm": 2.563418388366699, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5651, + "mean_token_accuracy": 0.8211695551872253, + "num_tokens": 16478233.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 2.549583911895752, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5428, + "mean_token_accuracy": 0.8272765874862671, + "num_tokens": 16514879.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 9.387731552124023e-07, + "grad_norm": 3.052494525909424, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5908, + "mean_token_accuracy": 0.8155047297477722, + "num_tokens": 16553959.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 9.462237358093262e-07, + "grad_norm": 2.443458318710327, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.830105721950531, + "num_tokens": 16598683.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 9.611248970031738e-07, + "grad_norm": 2.892017364501953, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8360521793365479, + "num_tokens": 16634224.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 2.778721809387207, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6246, + "mean_token_accuracy": 0.8050546646118164, + "num_tokens": 16675660.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 2.4354913234710693, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8406695127487183, + "num_tokens": 16719203.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 2.553032159805298, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.6246, + "mean_token_accuracy": 0.8046380281448364, + "num_tokens": 16762304.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 9.685754776000977e-07, + "grad_norm": 2.8279311656951904, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8381890654563904, + "num_tokens": 16801883.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 9.760260581970215e-07, + "grad_norm": 3.040543556213379, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.6751, + "mean_token_accuracy": 0.7956392168998718, + "num_tokens": 16837745.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.7543084621429443, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8190680146217346, + "num_tokens": 16883476.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.691244602203369, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.8301074504852295, + "num_tokens": 16927870.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 2.659325122833252, + "learning_rate": 1.88639253921153e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8317388296127319, + "num_tokens": 16966498.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 9.834766387939453e-07, + "grad_norm": 3.0070416927337646, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5875, + "mean_token_accuracy": 0.814478874206543, + "num_tokens": 17005706.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 9.909272193908691e-07, + "grad_norm": 2.3912365436553955, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8368863463401794, + "num_tokens": 17045960.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "ewc_loss": 9.909272193908691e-07, + "grad_norm": 2.7557129859924316, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5931, + "mean_token_accuracy": 0.8134575486183167, + "num_tokens": 17085731.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.586320638656616, + "learning_rate": 1.903348876642645e-07, + "loss": 0.5915, + "mean_token_accuracy": 0.8135047554969788, + "num_tokens": 17122345.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.7842214107513428, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5487, + "mean_token_accuracy": 0.8264208436012268, + "num_tokens": 17162330.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.208603858947754, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.6157, + "mean_token_accuracy": 0.8110939264297485, + "num_tokens": 17202389.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.7385013103485107, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.841077983379364, + "num_tokens": 17234065.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.865166664123535, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.8184902667999268, + "num_tokens": 17272741.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.4287850856781006, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.5687, + "mean_token_accuracy": 0.8187164068222046, + "num_tokens": 17302606.0, + "step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.7206337451934814, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.552, + "mean_token_accuracy": 0.8285186290740967, + "num_tokens": 17342740.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.9990792274475098, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8207796812057495, + "num_tokens": 17386660.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.778637409210205, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.562, + "mean_token_accuracy": 0.8207105994224548, + "num_tokens": 17428339.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 3.0345022678375244, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.8307387232780457, + "num_tokens": 17465164.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.990299940109253, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.6191, + "mean_token_accuracy": 0.8061854839324951, + "num_tokens": 17505233.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.5347883701324463, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5973, + "mean_token_accuracy": 0.8165433406829834, + "num_tokens": 17544386.0, + "step": 461 + }, + { + "epoch": 0.058771148708815675, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.9339988231658936, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.5571, + "mean_token_accuracy": 0.8253620862960815, + "num_tokens": 17583917.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.7208988666534424, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.5574, + "mean_token_accuracy": 0.8251532316207886, + "num_tokens": 17623308.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.933227777481079, + "learning_rate": 1.962696057651547e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8175874948501587, + "num_tokens": 17663449.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.9436209201812744, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8218303918838501, + "num_tokens": 17698778.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.746584892272949, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8390100598335266, + "num_tokens": 17736003.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.5113441944122314, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.6053, + "mean_token_accuracy": 0.8101658225059509, + "num_tokens": 17776586.0, + "step": 467 + }, + { + "epoch": 0.059534410380358734, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.78296160697937, + "learning_rate": 1.979652395082662e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8213281631469727, + "num_tokens": 17815132.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.30157470703125, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8361403942108154, + "num_tokens": 17855130.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 1.0058283805847168e-06, + "grad_norm": 2.8454034328460693, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.5506, + "mean_token_accuracy": 0.8263806700706482, + "num_tokens": 17892337.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 1.0132789611816406e-06, + "grad_norm": 2.5760154724121094, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8239328265190125, + "num_tokens": 17930291.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.8788869380950928, + "learning_rate": 1.996608732513777e-07, + "loss": 0.5872, + "mean_token_accuracy": 0.8249742388725281, + "num_tokens": 17965246.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.810960054397583, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8213878273963928, + "num_tokens": 18000503.0, + "step": 473 + }, + { + "epoch": 0.06029767205190179, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.21231746673584, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8289476037025452, + "num_tokens": 18043225.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 1.0207295417785645e-06, + "grad_norm": 2.962578296661377, + "learning_rate": 2.009325985587113e-07, + "loss": 0.5712, + "mean_token_accuracy": 0.8202738761901855, + "num_tokens": 18077481.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.3755602836608887, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8385330438613892, + "num_tokens": 18114172.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.7852368354797363, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8406382203102112, + "num_tokens": 18154108.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.4602646827697754, + "learning_rate": 2.022043238660449e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8230665326118469, + "num_tokens": 18194573.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "ewc_loss": 1.0356307029724121e-06, + "grad_norm": 2.733576536178589, + "learning_rate": 2.026282323018228e-07, + "loss": 0.4955, + "mean_token_accuracy": 0.842689037322998, + "num_tokens": 18229092.0, + "step": 479 + }, + { + "epoch": 0.06106093372344486, + "ewc_loss": 1.043081283569336e-06, + "grad_norm": 2.618346691131592, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.5734, + "mean_token_accuracy": 0.8146746158599854, + "num_tokens": 18264065.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 1.0505318641662598e-06, + "grad_norm": 2.001677989959717, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8253673315048218, + "num_tokens": 18306141.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 1.0505318641662598e-06, + "grad_norm": 2.4881439208984375, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5987, + "mean_token_accuracy": 0.8157598972320557, + "num_tokens": 18345864.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 2.4801342487335205, + "learning_rate": 2.043238660449343e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.8191951513290405, + "num_tokens": 18381456.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 1.0579824447631836e-06, + "grad_norm": 2.7577075958251953, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8287504315376282, + "num_tokens": 18421688.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "ewc_loss": 1.0654330253601074e-06, + "grad_norm": 2.5895309448242188, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.6107, + "mean_token_accuracy": 0.8054868578910828, + "num_tokens": 18463594.0, + "step": 485 + }, + { + "epoch": 0.061824195394987916, + "ewc_loss": 1.0654330253601074e-06, + "grad_norm": 2.479569673538208, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8219601511955261, + "num_tokens": 18497583.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 1.0654330253601074e-06, + "grad_norm": 2.3103814125061035, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8389129638671875, + "num_tokens": 18533373.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 1.0728836059570312e-06, + "grad_norm": 2.249934434890747, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8354107141494751, + "num_tokens": 18574509.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 1.0728836059570312e-06, + "grad_norm": 2.702897548675537, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8160887360572815, + "num_tokens": 18613165.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 1.087784767150879e-06, + "grad_norm": 3.2254526615142822, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5399, + "mean_token_accuracy": 0.8307256698608398, + "num_tokens": 18647127.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 1.080334186553955e-06, + "grad_norm": 2.7177624702453613, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5848, + "mean_token_accuracy": 0.8151547312736511, + "num_tokens": 18686247.0, + "step": 491 + }, + { + "epoch": 0.06258745706653097, + "ewc_loss": 1.080334186553955e-06, + "grad_norm": 2.4327073097229004, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.545, + "mean_token_accuracy": 0.8259400129318237, + "num_tokens": 18723156.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 1.0952353477478027e-06, + "grad_norm": 2.363934278488159, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.836294949054718, + "num_tokens": 18757189.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 1.0952353477478027e-06, + "grad_norm": 2.670579671859741, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.5483, + "mean_token_accuracy": 0.8286738395690918, + "num_tokens": 18797937.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 1.1101365089416504e-06, + "grad_norm": 2.733034372329712, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8229436874389648, + "num_tokens": 18830862.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 2.1837637424468994, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.8387068510055542, + "num_tokens": 18872643.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 1.1175870895385742e-06, + "grad_norm": 2.526150941848755, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.5672, + "mean_token_accuracy": 0.8209861516952515, + "num_tokens": 18910152.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.5711097717285156, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8150483965873718, + "num_tokens": 18949446.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.211441993713379, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8209969997406006, + "num_tokens": 18985865.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.2583065032958984, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5997, + "mean_token_accuracy": 0.8138610124588013, + "num_tokens": 19025670.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.7115185260772705, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8266217708587646, + "num_tokens": 19059307.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 1.1473894119262695e-06, + "grad_norm": 2.4639077186584473, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.8332583904266357, + "num_tokens": 19090384.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 2.439485788345337, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.8306180238723755, + "num_tokens": 19121512.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 1.1548399925231934e-06, + "grad_norm": 2.3058383464813232, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.5906, + "mean_token_accuracy": 0.8134896159172058, + "num_tokens": 19158522.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 1.1622905731201172e-06, + "grad_norm": 2.0789597034454346, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.84747314453125, + "num_tokens": 19197101.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 1.1622905731201172e-06, + "grad_norm": 2.4706108570098877, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8433237671852112, + "num_tokens": 19235611.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 2.894031286239624, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8099503517150879, + "num_tokens": 19270904.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 1.169741153717041e-06, + "grad_norm": 2.2913012504577637, + "learning_rate": 2.149215769393811e-07, + "loss": 0.5195, + "mean_token_accuracy": 0.8352233171463013, + "num_tokens": 19302067.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 1.1771917343139648e-06, + "grad_norm": 2.067092180252075, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8277462124824524, + "num_tokens": 19339250.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 2.4515604972839355, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8316489458084106, + "num_tokens": 19378828.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 1.9930236339569092, + "learning_rate": 2.161933022467147e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.839510977268219, + "num_tokens": 19422994.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 1.1846423149108887e-06, + "grad_norm": 2.246614933013916, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8370077013969421, + "num_tokens": 19459828.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 1.1995434761047363e-06, + "grad_norm": 2.0683705806732178, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8376542329788208, + "num_tokens": 19502715.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 2.222633123397827, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.836470365524292, + "num_tokens": 19545178.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 1.2069940567016602e-06, + "grad_norm": 2.050952196121216, + "learning_rate": 2.178889359898262e-07, + "loss": 0.6314, + "mean_token_accuracy": 0.8004540205001831, + "num_tokens": 19589352.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 1.214444637298584e-06, + "grad_norm": 2.2387242317199707, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8398244380950928, + "num_tokens": 19629697.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 1.2293457984924316e-06, + "grad_norm": 2.2532474994659424, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.5704, + "mean_token_accuracy": 0.8222633600234985, + "num_tokens": 19665866.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 1.2293457984924316e-06, + "grad_norm": 2.7552602291107178, + "learning_rate": 2.191606612971598e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8195840120315552, + "num_tokens": 19706677.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 1.2293457984924316e-06, + "grad_norm": 2.3530499935150146, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.5991, + "mean_token_accuracy": 0.8123931884765625, + "num_tokens": 19748850.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 1.2293457984924316e-06, + "grad_norm": 2.585303783416748, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8325706720352173, + "num_tokens": 19783832.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 1.2442469596862793e-06, + "grad_norm": 2.1008896827697754, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8305323123931885, + "num_tokens": 19824771.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 1.2367963790893555e-06, + "grad_norm": 2.5028562545776367, + "learning_rate": 2.208562950402713e-07, + "loss": 0.5526, + "mean_token_accuracy": 0.8250541687011719, + "num_tokens": 19860165.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 2.0055949687957764, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8262273073196411, + "num_tokens": 19905863.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 1.2516975402832031e-06, + "grad_norm": 2.4356820583343506, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.558, + "mean_token_accuracy": 0.8211111426353455, + "num_tokens": 19940222.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 1.259148120880127e-06, + "grad_norm": 2.1112723350524902, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8298347592353821, + "num_tokens": 19980714.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 1.2665987014770508e-06, + "grad_norm": 2.1095290184020996, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.5963, + "mean_token_accuracy": 0.8087994456291199, + "num_tokens": 20018190.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 1.2665987014770508e-06, + "grad_norm": 2.011979341506958, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.6266, + "mean_token_accuracy": 0.8047539591789246, + "num_tokens": 20056993.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 1.2740492820739746e-06, + "grad_norm": 2.272181749343872, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.8239889144897461, + "num_tokens": 20100324.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 1.2889504432678223e-06, + "grad_norm": 2.107339859008789, + "learning_rate": 2.238236540907164e-07, + "loss": 0.5638, + "mean_token_accuracy": 0.820601761341095, + "num_tokens": 20136416.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 1.2814998626708984e-06, + "grad_norm": 2.036782741546631, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8305172920227051, + "num_tokens": 20181632.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 2.0526325702667236, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5726, + "mean_token_accuracy": 0.8190598487854004, + "num_tokens": 20224169.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.166062831878662, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8440160155296326, + "num_tokens": 20261389.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.3545305728912354, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.6032, + "mean_token_accuracy": 0.8133760690689087, + "num_tokens": 20297596.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 1.296401023864746e-06, + "grad_norm": 2.398725748062134, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.5812, + "mean_token_accuracy": 0.8166254162788391, + "num_tokens": 20335929.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.113217830657959, + "learning_rate": 2.263671047053836e-07, + "loss": 0.5686, + "mean_token_accuracy": 0.8234732151031494, + "num_tokens": 20376132.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.2087550163269043, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8236020803451538, + "num_tokens": 20411018.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.330768585205078, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.8262166976928711, + "num_tokens": 20444633.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.1786909103393555, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8257300853729248, + "num_tokens": 20481316.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.0665183067321777, + "learning_rate": 2.280627384484951e-07, + "loss": 0.5397, + "mean_token_accuracy": 0.8285359740257263, + "num_tokens": 20517589.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 1.30385160446167e-06, + "grad_norm": 2.180227279663086, + "learning_rate": 2.28486646884273e-07, + "loss": 0.5493, + "mean_token_accuracy": 0.8283538818359375, + "num_tokens": 20552863.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 1.934961199760437, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8398712873458862, + "num_tokens": 20590500.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 1.3113021850585938e-06, + "grad_norm": 2.102386951446533, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.5951, + "mean_token_accuracy": 0.8119643926620483, + "num_tokens": 20630659.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 1.3187527656555176e-06, + "grad_norm": 2.0616042613983154, + "learning_rate": 2.297583721916066e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8438701629638672, + "num_tokens": 20666515.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 1.3262033462524414e-06, + "grad_norm": 2.054194927215576, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8310209512710571, + "num_tokens": 20707319.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 1.3262033462524414e-06, + "grad_norm": 2.4256014823913574, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8365135192871094, + "num_tokens": 20738190.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 1.3336539268493652e-06, + "grad_norm": 2.058303117752075, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8358901739120483, + "num_tokens": 20780714.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.924424886703491, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8334646224975586, + "num_tokens": 20813000.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 1.341104507446289e-06, + "grad_norm": 2.1058461666107178, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.836918830871582, + "num_tokens": 20846234.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 1.341104507446289e-06, + "grad_norm": 2.11806058883667, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.5845, + "mean_token_accuracy": 0.8168871402740479, + "num_tokens": 20887139.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 2.175262689590454, + "learning_rate": 2.327257312420517e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8071737289428711, + "num_tokens": 20924573.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 1.3560056686401367e-06, + "grad_norm": 2.341859817504883, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.834078848361969, + "num_tokens": 20963980.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 1.3485550880432129e-06, + "grad_norm": 1.9337736368179321, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5493, + "mean_token_accuracy": 0.8294691443443298, + "num_tokens": 21006526.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.04329252243042, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8368204832077026, + "num_tokens": 21043277.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 2.2098755836486816, + "learning_rate": 2.344213649851632e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8368821144104004, + "num_tokens": 21077724.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 1.3709068298339844e-06, + "grad_norm": 1.8845173120498657, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.5561, + "mean_token_accuracy": 0.82649827003479, + "num_tokens": 21120119.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 1.3783574104309082e-06, + "grad_norm": 2.1229240894317627, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8217898607254028, + "num_tokens": 21163860.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 1.3783574104309082e-06, + "grad_norm": 2.3803675174713135, + "learning_rate": 2.356930902924968e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8298012018203735, + "num_tokens": 21200571.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 2.1851909160614014, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8336342573165894, + "num_tokens": 21241024.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 2.2931861877441406, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5771, + "mean_token_accuracy": 0.820731520652771, + "num_tokens": 21279595.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 1.3932585716247559e-06, + "grad_norm": 2.1243040561676025, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8213444948196411, + "num_tokens": 21318208.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 1.4007091522216797e-06, + "grad_norm": 2.098254680633545, + "learning_rate": 2.373887240356083e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8153942227363586, + "num_tokens": 21357353.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 1.4007091522216797e-06, + "grad_norm": 2.2444918155670166, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.822117805480957, + "num_tokens": 21395113.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 1.4081597328186035e-06, + "grad_norm": 2.2751402854919434, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8150414824485779, + "num_tokens": 21430253.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 1.4156103134155273e-06, + "grad_norm": 2.2880914211273193, + "learning_rate": 2.386604493429419e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.825594425201416, + "num_tokens": 21462332.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 1.4230608940124512e-06, + "grad_norm": 2.3112902641296387, + "learning_rate": 2.390843577787198e-07, + "loss": 0.5185, + "mean_token_accuracy": 0.8343680500984192, + "num_tokens": 21495388.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 1.4379620552062988e-06, + "grad_norm": 1.9458328485488892, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8434664607048035, + "num_tokens": 21534056.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 1.4454126358032227e-06, + "grad_norm": 2.3916242122650146, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.5551, + "mean_token_accuracy": 0.8243372440338135, + "num_tokens": 21564484.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 1.4528632164001465e-06, + "grad_norm": 2.1118688583374023, + "learning_rate": 2.403560830860534e-07, + "loss": 0.5127, + "mean_token_accuracy": 0.8338242173194885, + "num_tokens": 21605164.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 1.4677643775939941e-06, + "grad_norm": 2.0741875171661377, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8229972720146179, + "num_tokens": 21645488.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 1.4677643775939941e-06, + "grad_norm": 2.276185989379883, + "learning_rate": 2.4120389995760915e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8294578790664673, + "num_tokens": 21681406.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 1.4677643775939941e-06, + "grad_norm": 2.2316207885742188, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8234139084815979, + "num_tokens": 21714882.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 2.270766019821167, + "learning_rate": 2.420517168291649e-07, + "loss": 0.6061, + "mean_token_accuracy": 0.8084108829498291, + "num_tokens": 21749814.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 1.475214958190918e-06, + "grad_norm": 2.014078140258789, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8308501243591309, + "num_tokens": 21788079.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 1.4826655387878418e-06, + "grad_norm": 1.8419851064682007, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8384067416191101, + "num_tokens": 21825981.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.092130184173584, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5543, + "mean_token_accuracy": 0.8244748711585999, + "num_tokens": 21863669.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.052682638168335, + "learning_rate": 2.437473505722764e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8379800915718079, + "num_tokens": 21902236.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.0894317626953125, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5239, + "mean_token_accuracy": 0.8267391920089722, + "num_tokens": 21940028.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 2.230391502380371, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.5545, + "mean_token_accuracy": 0.8264899253845215, + "num_tokens": 21978565.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 1.4901161193847656e-06, + "grad_norm": 1.9279987812042236, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8407822847366333, + "num_tokens": 22014715.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 1.5050172805786133e-06, + "grad_norm": 1.9893805980682373, + "learning_rate": 2.454429843153879e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8260282278060913, + "num_tokens": 22055082.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 1.5050172805786133e-06, + "grad_norm": 2.0061304569244385, + "learning_rate": 2.458668927511657e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8328123688697815, + "num_tokens": 22091982.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 1.5124678611755371e-06, + "grad_norm": 1.8366754055023193, + "learning_rate": 2.462908011869436e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.840939998626709, + "num_tokens": 22130683.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 1.9239082336425781, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.5651, + "mean_token_accuracy": 0.8226551413536072, + "num_tokens": 22173076.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 1.5348196029663086e-06, + "grad_norm": 1.914602279663086, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.81215900182724, + "num_tokens": 22211048.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 1.5497207641601562e-06, + "grad_norm": 2.4531209468841553, + "learning_rate": 2.475625264942772e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8506293296813965, + "num_tokens": 22235702.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.955997109413147, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8243145942687988, + "num_tokens": 22270316.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.085130453109741, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.5504, + "mean_token_accuracy": 0.8264119625091553, + "num_tokens": 22308189.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 2.0808053016662598, + "learning_rate": 2.488342518016108e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8381556868553162, + "num_tokens": 22345872.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.840100884437561, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8392179012298584, + "num_tokens": 22385884.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 1.564621925354004e-06, + "grad_norm": 1.933327555656433, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8370108604431152, + "num_tokens": 22419210.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.858150601387024, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8344025015830994, + "num_tokens": 22461266.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 1.5720725059509277e-06, + "grad_norm": 1.7906956672668457, + "learning_rate": 2.505298855447223e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.8312923908233643, + "num_tokens": 22500185.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 2.011239528656006, + "learning_rate": 2.509537939805002e-07, + "loss": 0.6434, + "mean_token_accuracy": 0.7952966690063477, + "num_tokens": 22544733.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 1.5869736671447754e-06, + "grad_norm": 2.06605863571167, + "learning_rate": 2.513777024162781e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8496701717376709, + "num_tokens": 22585239.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 2.021019220352173, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5216, + "mean_token_accuracy": 0.8345391750335693, + "num_tokens": 22630308.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 1.5944242477416992e-06, + "grad_norm": 1.7268377542495728, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8394598960876465, + "num_tokens": 22671551.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.777101755142212, + "learning_rate": 2.526494277236117e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8283412456512451, + "num_tokens": 22707879.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.9386804103851318, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5901, + "mean_token_accuracy": 0.814550518989563, + "num_tokens": 22750797.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.9819401502609253, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5472, + "mean_token_accuracy": 0.8308122158050537, + "num_tokens": 22786930.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 2.104522466659546, + "learning_rate": 2.539211530309453e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8334349393844604, + "num_tokens": 22822864.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.829825520515442, + "learning_rate": 2.543450614667232e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8385606408119202, + "num_tokens": 22861607.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 1.9169809818267822, + "learning_rate": 2.547689699025011e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.8109107613563538, + "num_tokens": 22895381.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 2.0614664554595947, + "learning_rate": 2.551928783382789e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8324772715568542, + "num_tokens": 22931722.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 1.601874828338623e-06, + "grad_norm": 2.0182058811187744, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.5926, + "mean_token_accuracy": 0.8116508722305298, + "num_tokens": 22970242.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 2.145199775695801, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8332844972610474, + "num_tokens": 23002512.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 1.6093254089355469e-06, + "grad_norm": 2.243166208267212, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8305574655532837, + "num_tokens": 23037015.0, + "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 1.6242265701293945e-06, + "grad_norm": 1.7222588062286377, + "learning_rate": 2.568885120813904e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.844291090965271, + "num_tokens": 23077967.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 1.8064616918563843, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8306849002838135, + "num_tokens": 23120640.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 1.749573826789856, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8434070348739624, + "num_tokens": 23161255.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 1.6316771507263184e-06, + "grad_norm": 1.8684546947479248, + "learning_rate": 2.58160237388724e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.822411060333252, + "num_tokens": 23197986.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 1.6540288925170898e-06, + "grad_norm": 1.9811551570892334, + "learning_rate": 2.585841458245019e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8401395678520203, + "num_tokens": 23230412.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 1.6614794731140137e-06, + "grad_norm": 1.7457326650619507, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.839423656463623, + "num_tokens": 23272550.0, + "step": 612 + }, + { + "epoch": 0.0779799007759827, + "ewc_loss": 1.6689300537109375e-06, + "grad_norm": 2.0392191410064697, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8269907832145691, + "num_tokens": 23307491.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 1.6689300537109375e-06, + "grad_norm": 1.9153512716293335, + "learning_rate": 2.598558711318355e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.8334982395172119, + "num_tokens": 23346260.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 1.6689300537109375e-06, + "grad_norm": 1.812530279159546, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5285, + "mean_token_accuracy": 0.832169771194458, + "num_tokens": 23383117.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 1.6763806343078613e-06, + "grad_norm": 1.8638856410980225, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8458207845687866, + "num_tokens": 23419696.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 1.6838312149047852e-06, + "grad_norm": 1.929760217666626, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8208996057510376, + "num_tokens": 23454746.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.7596746683120728, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8280261754989624, + "num_tokens": 23495361.0, + "step": 618 + }, + { + "epoch": 0.07874316244752576, + "ewc_loss": 1.6987323760986328e-06, + "grad_norm": 1.9033151865005493, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8346351385116577, + "num_tokens": 23530899.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.8188750743865967, + "learning_rate": 2.623993217465028e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8325555324554443, + "num_tokens": 23568338.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.9269511699676514, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8358131051063538, + "num_tokens": 23606576.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.9987775087356567, + "learning_rate": 2.632471386180585e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.8293682336807251, + "num_tokens": 23649726.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.8575410842895508, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8380969762802124, + "num_tokens": 23687209.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 1.7061829566955566e-06, + "grad_norm": 1.9776275157928467, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8431136608123779, + "num_tokens": 23724614.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.8621654510498047, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8275192379951477, + "num_tokens": 23764697.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 1.7136335372924805e-06, + "grad_norm": 1.8881001472473145, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.8279148936271667, + "num_tokens": 23803869.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.6207412481307983, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8484796285629272, + "num_tokens": 23845926.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.724718689918518, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.84486985206604, + "num_tokens": 23887114.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.7607309818267822, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8366889357566833, + "num_tokens": 23926467.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.6223217248916626, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8377904891967773, + "num_tokens": 23969549.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.6712895631790161, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8469492197036743, + "num_tokens": 24009779.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 1.7210841178894043e-06, + "grad_norm": 1.755113959312439, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8362972736358643, + "num_tokens": 24048065.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 1.735985279083252e-06, + "grad_norm": 1.9528623819351196, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8192135691642761, + "num_tokens": 24082592.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 1.8102363348007202, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8312405347824097, + "num_tokens": 24118552.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 1.9957444667816162, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8469521999359131, + "num_tokens": 24154565.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 1.7434358596801758e-06, + "grad_norm": 1.7814576625823975, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.8204380869865417, + "num_tokens": 24198389.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 1.7583370208740234e-06, + "grad_norm": 1.9905481338500977, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8380410671234131, + "num_tokens": 24235850.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 2.028038263320923, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.5182, + "mean_token_accuracy": 0.831694483757019, + "num_tokens": 24266856.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 2.002044200897217, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.5983, + "mean_token_accuracy": 0.8128337860107422, + "num_tokens": 24300958.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 1.7508864402770996e-06, + "grad_norm": 1.8123657703399658, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8335134387016296, + "num_tokens": 24339358.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 1.7583370208740234e-06, + "grad_norm": 1.6716699600219727, + "learning_rate": 2.71301398897838e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8409028053283691, + "num_tokens": 24384108.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 1.7657876014709473e-06, + "grad_norm": 1.664228916168213, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4814, + "mean_token_accuracy": 0.8448566198348999, + "num_tokens": 24424958.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.942671775817871, + "learning_rate": 2.721492157693938e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.8166890740394592, + "num_tokens": 24464535.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 1.773238182067871e-06, + "grad_norm": 1.7817462682724, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8339003324508667, + "num_tokens": 24501969.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 1.780688762664795e-06, + "grad_norm": 1.8925833702087402, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8093279600143433, + "num_tokens": 24543782.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.846946358680725, + "learning_rate": 2.734209410767274e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8398699164390564, + "num_tokens": 24579055.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 1.8651707172393799, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5028, + "mean_token_accuracy": 0.8424699306488037, + "num_tokens": 24615913.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 1.7881393432617188e-06, + "grad_norm": 2.0022811889648438, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.8354439735412598, + "num_tokens": 24649612.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 1.7955899238586426e-06, + "grad_norm": 1.8182958364486694, + "learning_rate": 2.74692666384061e-07, + "loss": 0.5406, + "mean_token_accuracy": 0.8271850347518921, + "num_tokens": 24688118.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.8030065298080444, + "learning_rate": 2.751165748198389e-07, + "loss": 0.5189, + "mean_token_accuracy": 0.8327747583389282, + "num_tokens": 24725457.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.9012391567230225, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5297, + "mean_token_accuracy": 0.827023983001709, + "num_tokens": 24766840.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.6713491678237915, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.840909481048584, + "num_tokens": 24806671.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.7563477754592896, + "learning_rate": 2.763883001271725e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8295464515686035, + "num_tokens": 24850928.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.8154515027999878, + "learning_rate": 2.768122085629504e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8463991284370422, + "num_tokens": 24886812.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 1.8030405044555664e-06, + "grad_norm": 1.7244396209716797, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8228168487548828, + "num_tokens": 24929560.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 1.8104910850524902e-06, + "grad_norm": 1.7629200220108032, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8308291435241699, + "num_tokens": 24969626.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 1.8104910850524902e-06, + "grad_norm": 1.795639157295227, + "learning_rate": 2.78083933870284e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8255236148834229, + "num_tokens": 25006804.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 1.6945453882217407, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8356074690818787, + "num_tokens": 25043493.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 1.817941665649414e-06, + "grad_norm": 1.917248249053955, + "learning_rate": 2.789317507418398e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8406006693840027, + "num_tokens": 25075063.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.7969841957092285, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8337152004241943, + "num_tokens": 25112402.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.830858826637268, + "learning_rate": 2.797795676133955e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8245362639427185, + "num_tokens": 25154315.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.6038329601287842, + "learning_rate": 2.802034760491734e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8403640985488892, + "num_tokens": 25195443.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 1.8253922462463379e-06, + "grad_norm": 1.8637906312942505, + "learning_rate": 2.806273844849512e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8222718834877014, + "num_tokens": 25232600.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.6438298225402832, + "learning_rate": 2.810512929207291e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8550217747688293, + "num_tokens": 25270131.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.7329131364822388, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.4807, + "mean_token_accuracy": 0.8465346693992615, + "num_tokens": 25314616.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.926320195198059, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8340244889259338, + "num_tokens": 25347076.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 1.8328428268432617e-06, + "grad_norm": 1.694563865661621, + "learning_rate": 2.823230182280627e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8323819637298584, + "num_tokens": 25388360.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 2.1066038608551025, + "learning_rate": 2.827469266638406e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8502837419509888, + "num_tokens": 25418449.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 1.862645149230957e-06, + "grad_norm": 1.842650294303894, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8363473415374756, + "num_tokens": 25453547.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 1.862645149230957e-06, + "grad_norm": 1.9171037673950195, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5711, + "mean_token_accuracy": 0.820997953414917, + "num_tokens": 25487552.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 1.8551945686340332e-06, + "grad_norm": 1.552829384803772, + "learning_rate": 2.840186519711742e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.855872631072998, + "num_tokens": 25526627.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 1.8700957298278809e-06, + "grad_norm": 1.7030891180038452, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8388422727584839, + "num_tokens": 25565544.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 1.8700957298278809e-06, + "grad_norm": 2.007573366165161, + "learning_rate": 2.8486646884272995e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8183116316795349, + "num_tokens": 25600876.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.8893039226531982, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8461970090866089, + "num_tokens": 25639620.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.7626715898513794, + "learning_rate": 2.857142857142857e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8471537828445435, + "num_tokens": 25673639.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 1.8775463104248047e-06, + "grad_norm": 1.7028961181640625, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.833453357219696, + "num_tokens": 25713694.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 1.8849968910217285e-06, + "grad_norm": 2.0036087036132812, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8167834281921387, + "num_tokens": 25745459.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.7156490087509155, + "learning_rate": 2.869860110216193e-07, + "loss": 0.528, + "mean_token_accuracy": 0.8313436508178711, + "num_tokens": 25787297.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.8640650510787964, + "learning_rate": 2.874099194573972e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8316243290901184, + "num_tokens": 25826709.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 1.8924474716186523e-06, + "grad_norm": 1.8284786939620972, + "learning_rate": 2.878338278931751e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8393547534942627, + "num_tokens": 25861973.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 1.8998980522155762e-06, + "grad_norm": 1.6591862440109253, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8428149819374084, + "num_tokens": 25901390.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.755057454109192, + "learning_rate": 2.886816447647308e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8109519481658936, + "num_tokens": 25941533.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.6779828071594238, + "learning_rate": 2.891055532005087e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8469542860984802, + "num_tokens": 25980042.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 1.9073486328125e-06, + "grad_norm": 1.7616745233535767, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8308022618293762, + "num_tokens": 26020466.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 1.9222497940063477e-06, + "grad_norm": 1.6955815553665161, + "learning_rate": 2.899533700720644e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.8328168392181396, + "num_tokens": 26057479.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.7516251802444458, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8407050371170044, + "num_tokens": 26095185.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.8757537603378296, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8398213386535645, + "num_tokens": 26130967.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.7593371868133545, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8340650796890259, + "num_tokens": 26167198.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.8405507802963257, + "learning_rate": 2.916490038151759e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8422831892967224, + "num_tokens": 26198493.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 1.9371509552001953e-06, + "grad_norm": 1.7345542907714844, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5165, + "mean_token_accuracy": 0.8316705822944641, + "num_tokens": 26234509.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.7662562131881714, + "learning_rate": 2.9249682068673166e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8324459791183472, + "num_tokens": 26269705.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.8397561311721802, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.832496166229248, + "num_tokens": 26309452.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 1.952052116394043e-06, + "grad_norm": 1.6933679580688477, + "learning_rate": 2.933446375582874e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8350271582603455, + "num_tokens": 26351487.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.6141088008880615, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8501498699188232, + "num_tokens": 26392303.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 1.9669532775878906e-06, + "grad_norm": 1.7208589315414429, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8281062245368958, + "num_tokens": 26430576.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.8630610704421997, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8332215547561646, + "num_tokens": 26467606.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.7674940824508667, + "learning_rate": 2.9504027130139884e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8399369716644287, + "num_tokens": 26503192.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.7991284132003784, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8295825719833374, + "num_tokens": 26541786.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 1.996755599975586e-06, + "grad_norm": 1.8283170461654663, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.519, + "mean_token_accuracy": 0.8366199731826782, + "num_tokens": 26580088.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 2.0116567611694336e-06, + "grad_norm": 1.953891396522522, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8333853483200073, + "num_tokens": 26614795.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.8600876331329346, + "learning_rate": 2.9673590504451033e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8409739136695862, + "num_tokens": 26646253.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.7001286745071411, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8423844575881958, + "num_tokens": 26683195.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 2.0265579223632812e-06, + "grad_norm": 1.686256766319275, + "learning_rate": 2.975837219160661e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8339217305183411, + "num_tokens": 26723460.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.8533947467803955, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5426, + "mean_token_accuracy": 0.8254421353340149, + "num_tokens": 26766093.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.8424745798110962, + "learning_rate": 2.984315387876218e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8322647213935852, + "num_tokens": 26801331.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.8257297277450562, + "learning_rate": 2.988554472233997e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8238303661346436, + "num_tokens": 26837869.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.6228286027908325, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.8326268196105957, + "num_tokens": 26881948.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.5694340467453003, + "learning_rate": 2.997032640949555e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8427519798278809, + "num_tokens": 26924137.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.6544408798217773, + "learning_rate": 3.001271725307333e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8377029895782471, + "num_tokens": 26965548.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.7456856966018677, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5495, + "mean_token_accuracy": 0.8215822577476501, + "num_tokens": 27003048.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 2.0563602447509766e-06, + "grad_norm": 1.8092739582061768, + "learning_rate": 3.009749894022891e-07, + "loss": 0.5678, + "mean_token_accuracy": 0.820809543132782, + "num_tokens": 27045598.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.8316978216171265, + "learning_rate": 3.01398897838067e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8458859324455261, + "num_tokens": 27082757.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 2.0766468048095703, + "learning_rate": 3.018228062738448e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8380287885665894, + "num_tokens": 27112083.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 2.0712614059448242e-06, + "grad_norm": 1.674977421760559, + "learning_rate": 3.022467147096227e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8371400237083435, + "num_tokens": 27150623.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7746864557266235, + "learning_rate": 3.026706231454006e-07, + "loss": 0.5337, + "mean_token_accuracy": 0.8285583257675171, + "num_tokens": 27189767.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7904819250106812, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8436535596847534, + "num_tokens": 27225999.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7131025791168213, + "learning_rate": 3.035184400169563e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8423027992248535, + "num_tokens": 27265305.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.670055866241455, + "learning_rate": 3.039423484527342e-07, + "loss": 0.5698, + "mean_token_accuracy": 0.8173978924751282, + "num_tokens": 27307314.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.6982451677322388, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8255537748336792, + "num_tokens": 27343963.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.6026376485824585, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8507286310195923, + "num_tokens": 27383757.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.6834301948547363, + "learning_rate": 3.052140737600678e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.836739718914032, + "num_tokens": 27423110.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.6571857929229736, + "learning_rate": 3.056379821958457e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8445995450019836, + "num_tokens": 27463211.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.7613284587860107, + "learning_rate": 3.060618906316236e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8268427848815918, + "num_tokens": 27499362.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 2.086162567138672e-06, + "grad_norm": 1.648962140083313, + "learning_rate": 3.064857990674014e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8384861946105957, + "num_tokens": 27540093.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 2.1010637283325195e-06, + "grad_norm": 1.7095764875411987, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.48, + "mean_token_accuracy": 0.8420610427856445, + "num_tokens": 27581269.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.8156777620315552, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5155, + "mean_token_accuracy": 0.8315858840942383, + "num_tokens": 27619017.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.6392552852630615, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8457328081130981, + "num_tokens": 27661568.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.7138501405715942, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8356993198394775, + "num_tokens": 27697846.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.594581961631775, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.5397, + "mean_token_accuracy": 0.8255060911178589, + "num_tokens": 27742964.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.8598885536193848, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5455, + "mean_token_accuracy": 0.8292656540870667, + "num_tokens": 27780702.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.7081680297851562, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.4922, + "mean_token_accuracy": 0.8378031849861145, + "num_tokens": 27819669.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.6507543325424194, + "learning_rate": 3.098770665536244e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8289112448692322, + "num_tokens": 27860076.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.4793822765350342, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8383092284202576, + "num_tokens": 27902748.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.8766696453094482, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.825594425201416, + "num_tokens": 27937917.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 2.115964889526367e-06, + "grad_norm": 1.7548826932907104, + "learning_rate": 3.11148791860958e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8369391560554504, + "num_tokens": 27974054.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.7358105182647705, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8289202451705933, + "num_tokens": 28015423.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 2.130866050720215e-06, + "grad_norm": 1.793803334236145, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8369714021682739, + "num_tokens": 28054041.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.6466604471206665, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8443339467048645, + "num_tokens": 28091680.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.6255183219909668, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8315572738647461, + "num_tokens": 28134857.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 2.16066837310791e-06, + "grad_norm": 1.712229609489441, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.844251811504364, + "num_tokens": 28174506.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.739879846572876, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8379618525505066, + "num_tokens": 28213412.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.6075778007507324, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8365764617919922, + "num_tokens": 28254789.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.749468445777893, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8247748613357544, + "num_tokens": 28292659.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.8450967073440552, + "learning_rate": 3.149639677829589e-07, + "loss": 0.5917, + "mean_token_accuracy": 0.8086713552474976, + "num_tokens": 28328137.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 2.175569534301758e-06, + "grad_norm": 1.8302639722824097, + "learning_rate": 3.153878762187368e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8375410437583923, + "num_tokens": 28362354.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.8712347745895386, + "learning_rate": 3.158117846545146e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.814354419708252, + "num_tokens": 28396730.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.5775911808013916, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8456137180328369, + "num_tokens": 28436091.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.745071291923523, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8367290496826172, + "num_tokens": 28473123.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.690796971321106, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8440561294555664, + "num_tokens": 28514971.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.7577053308486938, + "learning_rate": 3.175074183976261e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8390277624130249, + "num_tokens": 28549600.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.8003785610198975, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8236250877380371, + "num_tokens": 28585805.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 2.1904706954956055e-06, + "grad_norm": 1.7054851055145264, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.829799234867096, + "num_tokens": 28623167.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.7755177021026611, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8363138437271118, + "num_tokens": 28659304.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 2.2202730178833008e-06, + "grad_norm": 1.706195592880249, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8250446319580078, + "num_tokens": 28696807.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 2.2351741790771484e-06, + "grad_norm": 1.782495141029358, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.5596, + "mean_token_accuracy": 0.8219550848007202, + "num_tokens": 28739026.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 2.2351741790771484e-06, + "grad_norm": 1.9834908246994019, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8320313096046448, + "num_tokens": 28769106.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.7203434705734253, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8182332515716553, + "num_tokens": 28807400.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.691781759262085, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8469884395599365, + "num_tokens": 28841765.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 2.250075340270996e-06, + "grad_norm": 1.8022215366363525, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8251620531082153, + "num_tokens": 28881805.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.8557997941970825, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8306025862693787, + "num_tokens": 28917565.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7012883424758911, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8453772068023682, + "num_tokens": 28951143.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "ewc_loss": 2.2649765014648438e-06, + "grad_norm": 1.7067080736160278, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.5116, + "mean_token_accuracy": 0.8349702954292297, + "num_tokens": 28987978.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.6954014301300049, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8362877368927002, + "num_tokens": 29025341.0, + "step": 763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8711390495300293, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.8277649283409119, + "num_tokens": 29058185.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.5765504837036133, + "learning_rate": 3.238660449342942e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.845542311668396, + "num_tokens": 29097540.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 2.2798776626586914e-06, + "grad_norm": 1.7130106687545776, + "learning_rate": 3.24289953370072e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8369542956352234, + "num_tokens": 29133099.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.580651879310608, + "learning_rate": 3.247138618058499e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8269352912902832, + "num_tokens": 29178893.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.6672766208648682, + "learning_rate": 3.251377702416278e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.8452637195587158, + "num_tokens": 29214791.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.6967198848724365, + "learning_rate": 3.255616786774057e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8411223888397217, + "num_tokens": 29251252.0, + "step": 769 + }, + { + "epoch": 0.09795191451469279, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.7462680339813232, + "learning_rate": 3.259855871131835e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8493480086326599, + "num_tokens": 29285666.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.5646190643310547, + "learning_rate": 3.264094955489614e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8459466695785522, + "num_tokens": 29328670.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.5953476428985596, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8263200521469116, + "num_tokens": 29367942.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 2.294778823852539e-06, + "grad_norm": 1.8293979167938232, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8377986550331116, + "num_tokens": 29402051.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.656401515007019, + "learning_rate": 3.27681220856295e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.8320811986923218, + "num_tokens": 29442832.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 2.3096799850463867e-06, + "grad_norm": 1.8007426261901855, + "learning_rate": 3.281051292920729e-07, + "loss": 0.5598, + "mean_token_accuracy": 0.8216938376426697, + "num_tokens": 29480015.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.9302092790603638, + "learning_rate": 3.285290377278508e-07, + "loss": 0.548, + "mean_token_accuracy": 0.82545006275177, + "num_tokens": 29515326.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.6218736171722412, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.841217577457428, + "num_tokens": 29554117.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 2.3245811462402344e-06, + "grad_norm": 1.5969692468643188, + "learning_rate": 3.293768545994065e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8487624526023865, + "num_tokens": 29592413.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.6996140480041504, + "learning_rate": 3.298007630351844e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8369057178497314, + "num_tokens": 29631574.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.654342532157898, + "learning_rate": 3.302246714709623e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8426375389099121, + "num_tokens": 29669099.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.661877989768982, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5237, + "mean_token_accuracy": 0.8314887881278992, + "num_tokens": 29708489.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.6900445222854614, + "learning_rate": 3.31072488342518e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8416907787322998, + "num_tokens": 29743586.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.8095426559448242, + "learning_rate": 3.314963967782959e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8315957188606262, + "num_tokens": 29778977.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.616797685623169, + "learning_rate": 3.319203052140738e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8271617293357849, + "num_tokens": 29819567.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.7001689672470093, + "learning_rate": 3.323442136498516e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8448165655136108, + "num_tokens": 29855404.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.6923774480819702, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8321288824081421, + "num_tokens": 29895262.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 2.339482307434082e-06, + "grad_norm": 1.5967484712600708, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8452180027961731, + "num_tokens": 29936018.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 2.3543834686279297e-06, + "grad_norm": 1.7974077463150024, + "learning_rate": 3.336159389571852e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8379479646682739, + "num_tokens": 29969779.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7654787302017212, + "learning_rate": 3.340398473929631e-07, + "loss": 0.579, + "mean_token_accuracy": 0.8197364807128906, + "num_tokens": 30008716.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.622269630432129, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8349388241767883, + "num_tokens": 30052524.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.741662859916687, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.4626, + "mean_token_accuracy": 0.8502885103225708, + "num_tokens": 30090227.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7114378213882446, + "learning_rate": 3.353115727002967e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.844031572341919, + "num_tokens": 30127727.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 2.127870798110962, + "learning_rate": 3.357354811360746e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8366249203681946, + "num_tokens": 30166291.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.6377629041671753, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8353502750396729, + "num_tokens": 30205099.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 2.3692846298217773e-06, + "grad_norm": 1.7574163675308228, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.488, + "mean_token_accuracy": 0.845416784286499, + "num_tokens": 30243999.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.8053852319717407, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5393, + "mean_token_accuracy": 0.8230932950973511, + "num_tokens": 30280537.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 2.3990869522094727e-06, + "grad_norm": 1.8225672245025635, + "learning_rate": 3.374311148791861e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8411020040512085, + "num_tokens": 30314369.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 2.4139881134033203e-06, + "grad_norm": 1.652470588684082, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8370734453201294, + "num_tokens": 30357370.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.6037557125091553, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5327, + "mean_token_accuracy": 0.8282737731933594, + "num_tokens": 30400914.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.6413062810897827, + "learning_rate": 3.387028401865197e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8427581787109375, + "num_tokens": 30438508.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.6739628314971924, + "learning_rate": 3.391267486222976e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8385746479034424, + "num_tokens": 30479011.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 2.4437904357910156e-06, + "grad_norm": 1.4195441007614136, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8572360277175903, + "num_tokens": 30521760.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.6402558088302612, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.502, + "mean_token_accuracy": 0.8376452922821045, + "num_tokens": 30561529.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.6533244848251343, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.826157808303833, + "num_tokens": 30601510.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 2.4586915969848633e-06, + "grad_norm": 1.5952565670013428, + "learning_rate": 3.408223823654091e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.8478020429611206, + "num_tokens": 30644869.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 1.668266773223877, + "learning_rate": 3.412462908011869e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.850002646446228, + "num_tokens": 30682325.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 1.7269188165664673, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8345156908035278, + "num_tokens": 30721799.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 1.870212435722351, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.819415807723999, + "num_tokens": 30758490.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 1.7119181156158447, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.825869083404541, + "num_tokens": 30800115.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 2.473592758178711e-06, + "grad_norm": 1.7731728553771973, + "learning_rate": 3.429419245442984e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8400890827178955, + "num_tokens": 30836829.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.664763331413269, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8532036542892456, + "num_tokens": 30874884.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 2.021843671798706, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8366831541061401, + "num_tokens": 30903944.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.7096116542816162, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8499646782875061, + "num_tokens": 30939892.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.5521215200424194, + "learning_rate": 3.446375582874099e-07, + "loss": 0.4755, + "mean_token_accuracy": 0.8449732065200806, + "num_tokens": 30980055.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 2.4884939193725586e-06, + "grad_norm": 1.7913979291915894, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.5482, + "mean_token_accuracy": 0.8238401412963867, + "num_tokens": 31017669.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.5796222686767578, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8367456793785095, + "num_tokens": 31056738.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.6495189666748047, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8316726088523865, + "num_tokens": 31094718.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.6212815046310425, + "learning_rate": 3.463331920305214e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8381746411323547, + "num_tokens": 31135246.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.6586635112762451, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8436475992202759, + "num_tokens": 31171724.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 2.5033950805664062e-06, + "grad_norm": 1.902024745941162, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8471270799636841, + "num_tokens": 31204427.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.690415382385254, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8436103463172913, + "num_tokens": 31241246.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.7010796070098877, + "learning_rate": 3.480288257736329e-07, + "loss": 0.593, + "mean_token_accuracy": 0.814087986946106, + "num_tokens": 31285802.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.5938746929168701, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8323142528533936, + "num_tokens": 31327133.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.7347761392593384, + "learning_rate": 3.4887664264518863e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8371185064315796, + "num_tokens": 31363816.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.7793858051300049, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8388581275939941, + "num_tokens": 31398311.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 2.518296241760254e-06, + "grad_norm": 1.603177547454834, + "learning_rate": 3.497244595167443e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8305891752243042, + "num_tokens": 31444402.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 1.5555708408355713, + "learning_rate": 3.501483679525222e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8426876068115234, + "num_tokens": 31482491.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 1.417351484298706, + "learning_rate": 3.505722763883001e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8456829190254211, + "num_tokens": 31528083.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 2.5331974029541016e-06, + "grad_norm": 1.6449565887451172, + "learning_rate": 3.50996184824078e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.8453508615493774, + "num_tokens": 31566502.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 2.5480985641479492e-06, + "grad_norm": 1.69855535030365, + "learning_rate": 3.514200932598558e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8265609741210938, + "num_tokens": 31603289.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.5416043996810913, + "learning_rate": 3.518440016956337e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8547812700271606, + "num_tokens": 31642843.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.6932756900787354, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5053, + "mean_token_accuracy": 0.8356745839118958, + "num_tokens": 31678703.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.651678204536438, + "learning_rate": 3.526918185671895e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8435627222061157, + "num_tokens": 31716275.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.5218039751052856, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8380089998245239, + "num_tokens": 31760616.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 2.562999725341797e-06, + "grad_norm": 1.7498700618743896, + "learning_rate": 3.535396354387452e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8408050537109375, + "num_tokens": 31796061.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.5126862525939941, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.831132173538208, + "num_tokens": 31839527.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.6404372453689575, + "learning_rate": 3.54387452310301e-07, + "loss": 0.573, + "mean_token_accuracy": 0.8150231838226318, + "num_tokens": 31881570.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.685492753982544, + "learning_rate": 3.548113607460788e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8496087789535522, + "num_tokens": 31917952.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 2.60770320892334e-06, + "grad_norm": 1.5984749794006348, + "learning_rate": 3.552352691818567e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8373072147369385, + "num_tokens": 31953677.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.5627185106277466, + "learning_rate": 3.556591776176346e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.844467043876648, + "num_tokens": 31995277.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.4902918338775635, + "learning_rate": 3.560830860534125e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.832443356513977, + "num_tokens": 32039254.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.645283579826355, + "learning_rate": 3.565069944891903e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8480280637741089, + "num_tokens": 32076885.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.6728299856185913, + "learning_rate": 3.569309029249682e-07, + "loss": 0.4348, + "mean_token_accuracy": 0.857759952545166, + "num_tokens": 32112024.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 2.6226043701171875e-06, + "grad_norm": 1.7032383680343628, + "learning_rate": 3.573548113607461e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8464164733886719, + "num_tokens": 32147895.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.5469732284545898, + "learning_rate": 3.577787197965239e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.849471926689148, + "num_tokens": 32189082.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 2.637505531311035e-06, + "grad_norm": 1.6332883834838867, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.848198413848877, + "num_tokens": 32226995.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.632477879524231, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8320061564445496, + "num_tokens": 32267920.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.6288793087005615, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8242552280426025, + "num_tokens": 32309123.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 2.652406692504883e-06, + "grad_norm": 1.5745830535888672, + "learning_rate": 3.594743535396354e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8488860130310059, + "num_tokens": 32347550.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 2.6673078536987305e-06, + "grad_norm": 1.6684010028839111, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8426768183708191, + "num_tokens": 32387881.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 2.6673078536987305e-06, + "grad_norm": 1.674188256263733, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8353328704833984, + "num_tokens": 32429570.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 2.6673078536987305e-06, + "grad_norm": 1.434914469718933, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8404254913330078, + "num_tokens": 32480079.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 2.6673078536987305e-06, + "grad_norm": 1.7104142904281616, + "learning_rate": 3.611699872827469e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8376235365867615, + "num_tokens": 32514125.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 2.6673078536987305e-06, + "grad_norm": 1.6137856245040894, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.8426297903060913, + "num_tokens": 32555673.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.774040699005127, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8287957310676575, + "num_tokens": 32593259.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 2.682209014892578e-06, + "grad_norm": 1.6410599946975708, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.4502, + "mean_token_accuracy": 0.8498412370681763, + "num_tokens": 32629954.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6405109167099, + "learning_rate": 3.628656210258584e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8449221253395081, + "num_tokens": 32667202.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.8064501285552979, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8311347961425781, + "num_tokens": 32702360.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6355886459350586, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8287295699119568, + "num_tokens": 32742817.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 2.6971101760864258e-06, + "grad_norm": 1.6860016584396362, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.843660831451416, + "num_tokens": 32778534.0, + "step": 860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 2.7120113372802734e-06, + "grad_norm": 1.6512633562088013, + "learning_rate": 3.645612547689699e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.840687096118927, + "num_tokens": 32813486.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.6602672338485718, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8483889102935791, + "num_tokens": 32847192.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.5471278429031372, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8375062942504883, + "num_tokens": 32888447.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.7929540872573853, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5759, + "mean_token_accuracy": 0.8143199682235718, + "num_tokens": 32926684.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.5670299530029297, + "learning_rate": 3.662568885120814e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.835690975189209, + "num_tokens": 32968688.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.902208685874939, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8233145475387573, + "num_tokens": 33000176.0, + "step": 866 + }, + { + "epoch": 0.11029131153797227, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.5559639930725098, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8357337713241577, + "num_tokens": 33042328.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.8769192695617676, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8396098613739014, + "num_tokens": 33076161.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 2.726912498474121e-06, + "grad_norm": 1.9316540956497192, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8283059597015381, + "num_tokens": 33105109.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.6955963373184204, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8335675001144409, + "num_tokens": 33141566.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 2.7418136596679688e-06, + "grad_norm": 1.6615188121795654, + "learning_rate": 3.688003391267486e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8400163650512695, + "num_tokens": 33177051.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.8167723417282104, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8285672664642334, + "num_tokens": 33208346.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.5368213653564453, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.4308, + "mean_token_accuracy": 0.8565531969070435, + "num_tokens": 33247785.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.6913416385650635, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8449119329452515, + "num_tokens": 33285002.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.5660935640335083, + "learning_rate": 3.704959728698601e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8482166528701782, + "num_tokens": 33325825.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.7444922924041748, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.843011736869812, + "num_tokens": 33361271.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 2.7567148208618164e-06, + "grad_norm": 1.8181310892105103, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8366613388061523, + "num_tokens": 33392419.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.701302409172058, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8428775668144226, + "num_tokens": 33430207.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.5361231565475464, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4823, + "mean_token_accuracy": 0.8439934849739075, + "num_tokens": 33474098.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.7429301738739014, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.4991, + "mean_token_accuracy": 0.8376896977424622, + "num_tokens": 33509282.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.8528923988342285, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8441023826599121, + "num_tokens": 33542981.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.5231469869613647, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8469433784484863, + "num_tokens": 33583622.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 2.771615982055664e-06, + "grad_norm": 1.5817115306854248, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8462176322937012, + "num_tokens": 33621579.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.6438572406768799, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8526662588119507, + "num_tokens": 33657725.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.823256492614746, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8442885875701904, + "num_tokens": 33695663.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.678245186805725, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8473003506660461, + "num_tokens": 33729708.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.6759823560714722, + "learning_rate": 3.755828740991945e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8565884828567505, + "num_tokens": 33768168.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.5235944986343384, + "learning_rate": 3.760067825349724e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8322851657867432, + "num_tokens": 33813920.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.592911958694458, + "learning_rate": 3.764306909707503e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.84217369556427, + "num_tokens": 33851252.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 2.7865171432495117e-06, + "grad_norm": 1.731829047203064, + "learning_rate": 3.768545994065282e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8384960889816284, + "num_tokens": 33884951.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.7559641599655151, + "learning_rate": 3.77278507842306e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8391048312187195, + "num_tokens": 33920236.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.605237603187561, + "learning_rate": 3.777024162780839e-07, + "loss": 0.4914, + "mean_token_accuracy": 0.841913104057312, + "num_tokens": 33960370.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 2.8014183044433594e-06, + "grad_norm": 1.627814769744873, + "learning_rate": 3.781263247138618e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8306509256362915, + "num_tokens": 34000008.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 1.6818302869796753, + "learning_rate": 3.785502331496397e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8394496440887451, + "num_tokens": 34037582.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 1.7241514921188354, + "learning_rate": 3.789741415854175e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8466472625732422, + "num_tokens": 34070901.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.510974645614624, + "learning_rate": 3.793980500211954e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.856629490852356, + "num_tokens": 34112775.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.4994616508483887, + "learning_rate": 3.798219584569733e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.8371013402938843, + "num_tokens": 34157309.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 2.8312206268310547e-06, + "grad_norm": 1.7017910480499268, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8425570726394653, + "num_tokens": 34192201.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 2.816319465637207e-06, + "grad_norm": 1.474327564239502, + "learning_rate": 3.80669775328529e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8489063382148743, + "num_tokens": 34233324.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.6882835626602173, + "learning_rate": 3.810936837643069e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.836029052734375, + "num_tokens": 34273863.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.657719612121582, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.841916561126709, + "num_tokens": 34312381.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.6512523889541626, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8439669609069824, + "num_tokens": 34347552.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 2.8461217880249023e-06, + "grad_norm": 1.530638575553894, + "learning_rate": 3.823654090716405e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8313874006271362, + "num_tokens": 34389417.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 1.6375809907913208, + "learning_rate": 3.827893175074184e-07, + "loss": 0.4493, + "mean_token_accuracy": 0.8526887893676758, + "num_tokens": 34424833.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 2.86102294921875e-06, + "grad_norm": 1.7995167970657349, + "learning_rate": 3.832132259431963e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8325384855270386, + "num_tokens": 34462012.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 1.6451644897460938, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8278869986534119, + "num_tokens": 34500700.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 1.7233330011367798, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.4642, + "mean_token_accuracy": 0.8454102277755737, + "num_tokens": 34536977.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 1.6029771566390991, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8560400605201721, + "num_tokens": 34574077.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 2.8759241104125977e-06, + "grad_norm": 1.5535868406295776, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8453058004379272, + "num_tokens": 34615597.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 2.8908252716064453e-06, + "grad_norm": 1.8102511167526245, + "learning_rate": 3.853327681220856e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8405399322509766, + "num_tokens": 34649956.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 2.8908252716064453e-06, + "grad_norm": 1.6820805072784424, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.4382, + "mean_token_accuracy": 0.8549371957778931, + "num_tokens": 34683399.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 2.8908252716064453e-06, + "grad_norm": 1.7017390727996826, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.4824, + "mean_token_accuracy": 0.8414692878723145, + "num_tokens": 34718660.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.5256778001785278, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8486215472221375, + "num_tokens": 34759209.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.636276364326477, + "learning_rate": 3.870284018651971e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8374443054199219, + "num_tokens": 34797953.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.8059840202331543, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.4708, + "mean_token_accuracy": 0.8473016023635864, + "num_tokens": 34827513.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.5524946451187134, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4372, + "mean_token_accuracy": 0.8620872497558594, + "num_tokens": 34867088.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.625980019569397, + "learning_rate": 3.883001271725307e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8394089937210083, + "num_tokens": 34906581.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 2.905726432800293e-06, + "grad_norm": 1.6066484451293945, + "learning_rate": 3.887240356083086e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8391409516334534, + "num_tokens": 34946378.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.5400967597961426, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8348467350006104, + "num_tokens": 34990477.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.7484050989151, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8429486155509949, + "num_tokens": 35023145.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.8923988342285156, + "learning_rate": 3.899957609156422e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8411864638328552, + "num_tokens": 35055653.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.6485520601272583, + "learning_rate": 3.904196693514201e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8543699979782104, + "num_tokens": 35095052.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.6002620458602905, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8431416153907776, + "num_tokens": 35134374.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 2.9206275939941406e-06, + "grad_norm": 1.6657308340072632, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.8431024551391602, + "num_tokens": 35171091.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.8328299522399902, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8287561535835266, + "num_tokens": 35206626.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.7110207080841064, + "learning_rate": 3.921153030945316e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8578931093215942, + "num_tokens": 35242871.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 2.9355287551879883e-06, + "grad_norm": 1.5999640226364136, + "learning_rate": 3.925392115303094e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8443064093589783, + "num_tokens": 35281779.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 2.950429916381836e-06, + "grad_norm": 1.3872755765914917, + "learning_rate": 3.929631199660873e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8423563241958618, + "num_tokens": 35332590.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.554531455039978, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8507405519485474, + "num_tokens": 35370014.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.6991413831710815, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8515713214874268, + "num_tokens": 35404749.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 2.9653310775756836e-06, + "grad_norm": 1.5154368877410889, + "learning_rate": 3.942348452734209e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8550776243209839, + "num_tokens": 35443806.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 1.7510145902633667, + "learning_rate": 3.946587537091988e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8309682607650757, + "num_tokens": 35480860.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 1.6323978900909424, + "learning_rate": 3.9508266214497666e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8417584896087646, + "num_tokens": 35518783.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 1.6975146532058716, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8311783075332642, + "num_tokens": 35557493.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 1.5946303606033325, + "learning_rate": 3.959304790165324e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8322923183441162, + "num_tokens": 35599692.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 2.9802322387695312e-06, + "grad_norm": 1.4496443271636963, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.847318172454834, + "num_tokens": 35644089.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.608788251876831, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8458433747291565, + "num_tokens": 35682823.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.8385697603225708, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5434, + "mean_token_accuracy": 0.8244297504425049, + "num_tokens": 35712997.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 2.995133399963379e-06, + "grad_norm": 1.6164250373840332, + "learning_rate": 3.976261127596439e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.850269079208374, + "num_tokens": 35748506.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.6290886402130127, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8569109439849854, + "num_tokens": 35786583.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.5360864400863647, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.4745, + "mean_token_accuracy": 0.8463082313537598, + "num_tokens": 35828710.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.5514038801193237, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8345832824707031, + "num_tokens": 35870598.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.578054666519165, + "learning_rate": 3.993217465027554e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.852153480052948, + "num_tokens": 35907649.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.5959527492523193, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4532, + "mean_token_accuracy": 0.8537434339523315, + "num_tokens": 35947883.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.584696888923645, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.4363, + "mean_token_accuracy": 0.8605902194976807, + "num_tokens": 35986016.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 3.0249357223510742e-06, + "grad_norm": 1.6589244604110718, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8461729884147644, + "num_tokens": 36021788.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 1.6221694946289062, + "learning_rate": 4.010173802458669e-07, + "loss": 0.45, + "mean_token_accuracy": 0.854132354259491, + "num_tokens": 36059982.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 1.6398940086364746, + "learning_rate": 4.014412886816447e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8441987633705139, + "num_tokens": 36099078.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 1.6537739038467407, + "learning_rate": 4.018651971174226e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8446193933486938, + "num_tokens": 36139014.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 3.039836883544922e-06, + "grad_norm": 1.682471513748169, + "learning_rate": 4.022891055532005e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8464441299438477, + "num_tokens": 36175819.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.671958088874817, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.8301059007644653, + "num_tokens": 36216658.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.4957317113876343, + "learning_rate": 4.031369224247562e-07, + "loss": 0.3904, + "mean_token_accuracy": 0.872157871723175, + "num_tokens": 36256678.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.4615787267684937, + "learning_rate": 4.035608308605341e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8459657430648804, + "num_tokens": 36300676.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.676198959350586, + "learning_rate": 4.03984739296312e-07, + "loss": 0.4481, + "mean_token_accuracy": 0.8513083457946777, + "num_tokens": 36334489.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.5594719648361206, + "learning_rate": 4.044086477320898e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.841812252998352, + "num_tokens": 36376781.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.5614806413650513, + "learning_rate": 4.048325561678677e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8493335247039795, + "num_tokens": 36422177.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.5711283683776855, + "learning_rate": 4.052564646036456e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8568635582923889, + "num_tokens": 36460679.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.727915644645691, + "learning_rate": 4.056803730394235e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8417367935180664, + "num_tokens": 36495828.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.7367075681686401, + "learning_rate": 4.061042814752013e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8477165699005127, + "num_tokens": 36527734.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.5765957832336426, + "learning_rate": 4.065281899109792e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8401414752006531, + "num_tokens": 36566236.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "ewc_loss": 3.069639205932617e-06, + "grad_norm": 1.758156180381775, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8541423082351685, + "num_tokens": 36598307.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 3.084540367126465e-06, + "grad_norm": 1.993290662765503, + "learning_rate": 4.07376006782535e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.822810173034668, + "num_tokens": 36632678.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7214457988739014, + "learning_rate": 4.077999152183128e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8312393426895142, + "num_tokens": 36670034.0, + "step": 963 + }, + { + "epoch": 0.12263070856125174, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6108556985855103, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5218, + "mean_token_accuracy": 0.8335413932800293, + "num_tokens": 36709187.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.7079907655715942, + "learning_rate": 4.086477320898686e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.842793881893158, + "num_tokens": 36745507.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.702120304107666, + "learning_rate": 4.090716405256465e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8265591859817505, + "num_tokens": 36784944.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.4921648502349854, + "learning_rate": 4.094955489614243e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8462792038917542, + "num_tokens": 36825729.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 2.1164488792419434, + "learning_rate": 4.099194573972022e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.825305700302124, + "num_tokens": 36854920.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6486561298370361, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.5216, + "mean_token_accuracy": 0.8357075452804565, + "num_tokens": 36892552.0, + "step": 969 + }, + { + "epoch": 0.1233939702327948, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6048741340637207, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8502582311630249, + "num_tokens": 36929185.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.6655499935150146, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8436539173126221, + "num_tokens": 36964584.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.5708833932876587, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8463690876960754, + "num_tokens": 37005493.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 3.11434268951416e-06, + "grad_norm": 1.542332649230957, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8498976230621338, + "num_tokens": 37044885.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 2.1333391666412354, + "learning_rate": 4.124629080118694e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8552082180976868, + "num_tokens": 37076794.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 3.129243850708008e-06, + "grad_norm": 1.7215291261672974, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8278690576553345, + "num_tokens": 37114531.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + "ewc_loss": 3.1441450119018555e-06, + "grad_norm": 1.6285712718963623, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8377474546432495, + "num_tokens": 37150963.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.5126389265060425, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8500586748123169, + "num_tokens": 37191466.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.4357260465621948, + "learning_rate": 4.141585417549809e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.840992271900177, + "num_tokens": 37233363.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 3.159046173095703e-06, + "grad_norm": 1.7865849733352661, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.5414, + "mean_token_accuracy": 0.8265728950500488, + "num_tokens": 37268090.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.6104772090911865, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8519644737243652, + "num_tokens": 37307091.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.6550571918487549, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8309488892555237, + "num_tokens": 37346890.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8065747022628784, + "learning_rate": 4.158541754980924e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.8291510343551636, + "num_tokens": 37376709.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.5418840646743774, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8439066410064697, + "num_tokens": 37416180.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.5822803974151611, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8540828227996826, + "num_tokens": 37454911.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.6381629705429077, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8477353453636169, + "num_tokens": 37489316.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.8913387060165405, + "learning_rate": 4.175498092412039e-07, + "loss": 0.5053, + "mean_token_accuracy": 0.8353118896484375, + "num_tokens": 37520399.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.5581387281417847, + "learning_rate": 4.179737176769817e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8397286534309387, + "num_tokens": 37561078.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.5709526538848877, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8427309989929199, + "num_tokens": 37601554.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 3.1739473342895508e-06, + "grad_norm": 1.6000428199768066, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8475157618522644, + "num_tokens": 37638831.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.579543113708496, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8353752493858337, + "num_tokens": 37680738.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.7378270626068115, + "learning_rate": 4.196693514200932e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8412936925888062, + "num_tokens": 37713831.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.758239984512329, + "learning_rate": 4.200932598558711e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8283786177635193, + "num_tokens": 37750902.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 3.203749656677246e-06, + "grad_norm": 1.6367261409759521, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.4969, + "mean_token_accuracy": 0.8396064639091492, + "num_tokens": 37790206.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.718916416168213, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.437, + "mean_token_accuracy": 0.8560964465141296, + "num_tokens": 37827182.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.7111560106277466, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.832923173904419, + "num_tokens": 37864658.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 3.2186508178710938e-06, + "grad_norm": 1.4562101364135742, + "learning_rate": 4.217888935989826e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8582404851913452, + "num_tokens": 37907516.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.7254276275634766, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.4322, + "mean_token_accuracy": 0.8597724437713623, + "num_tokens": 37942264.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.6779322624206543, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.8525562286376953, + "num_tokens": 37975882.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.6599117517471313, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.84125816822052, + "num_tokens": 38012551.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 3.248453140258789e-06, + "grad_norm": 1.5604488849639893, + "learning_rate": 4.234845273420941e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8308385610580444, + "num_tokens": 38053322.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.5257076025009155, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.8458464741706848, + "num_tokens": 38097126.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.6190321445465088, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8539382219314575, + "num_tokens": 38135885.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7011264562606812, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.84061199426651, + "num_tokens": 38172552.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.7411378622055054, + "learning_rate": 4.251801610852056e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8373012542724609, + "num_tokens": 38208794.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 3.2633543014526367e-06, + "grad_norm": 1.655163288116455, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8286676406860352, + "num_tokens": 38247920.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.5266027450561523, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.852064847946167, + "num_tokens": 38287508.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7902421951293945, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8386139869689941, + "num_tokens": 38320305.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.787634015083313, + "learning_rate": 4.26875794828317e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8567829728126526, + "num_tokens": 38352442.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.4770008325576782, + "learning_rate": 4.272997032640949e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8532384634017944, + "num_tokens": 38396086.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.6210227012634277, + "learning_rate": 4.277236116998728e-07, + "loss": 0.4919, + "mean_token_accuracy": 0.8406111001968384, + "num_tokens": 38432889.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.7710909843444824, + "learning_rate": 4.281475201356507e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8408995866775513, + "num_tokens": 38471681.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 2.114534854888916, + "learning_rate": 4.285714285714285e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8535164594650269, + "num_tokens": 38505145.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.5999847650527954, + "learning_rate": 4.289953370072064e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8541368246078491, + "num_tokens": 38540012.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 3.2782554626464844e-06, + "grad_norm": 1.6723555326461792, + "learning_rate": 4.294192454429843e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.8239604234695435, + "num_tokens": 38581861.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 3.293156623840332e-06, + "grad_norm": 1.5650683641433716, + "learning_rate": 4.298431538787622e-07, + "loss": 0.5464, + "mean_token_accuracy": 0.8263059258460999, + "num_tokens": 38630809.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.6746578216552734, + "learning_rate": 4.3026706231454e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8488811254501343, + "num_tokens": 38663639.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.7048237323760986, + "learning_rate": 4.306909707503179e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8495194911956787, + "num_tokens": 38697680.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 3.3080577850341797e-06, + "grad_norm": 1.51897394657135, + "learning_rate": 4.311148791860958e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8496996164321899, + "num_tokens": 38736429.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.6894352436065674, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8304776549339294, + "num_tokens": 38773634.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.6449334621429443, + "learning_rate": 4.319626960576515e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8397849202156067, + "num_tokens": 38811897.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.604796290397644, + "learning_rate": 4.323866044934294e-07, + "loss": 0.5129, + "mean_token_accuracy": 0.8314265012741089, + "num_tokens": 38849567.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.7415695190429688, + "learning_rate": 4.328105129292073e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.831445574760437, + "num_tokens": 38885428.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 3.337860107421875e-06, + "grad_norm": 1.5392062664031982, + "learning_rate": 4.332344213649852e-07, + "loss": 0.4162, + "mean_token_accuracy": 0.8620814085006714, + "num_tokens": 38927537.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 3.3229589462280273e-06, + "grad_norm": 1.8403903245925903, + "learning_rate": 4.33658329800763e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8476190567016602, + "num_tokens": 38971851.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.6639715433120728, + "learning_rate": 4.340822382365409e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8333652019500732, + "num_tokens": 39007367.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.691442847251892, + "learning_rate": 4.345061466723188e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8243124485015869, + "num_tokens": 39044944.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 3.3527612686157227e-06, + "grad_norm": 1.762606143951416, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.8420398831367493, + "num_tokens": 39080635.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 3.3676624298095703e-06, + "grad_norm": 1.9547051191329956, + "learning_rate": 4.353539635438745e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.8234038949012756, + "num_tokens": 39114671.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.7064708471298218, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8374930620193481, + "num_tokens": 39149590.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.5727710723876953, + "learning_rate": 4.362017804154303e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8444730043411255, + "num_tokens": 39187414.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.6870793104171753, + "learning_rate": 4.366256888512081e-07, + "loss": 0.465, + "mean_token_accuracy": 0.8469880819320679, + "num_tokens": 39221936.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.688370704650879, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.8573307394981384, + "num_tokens": 39259463.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 3.3974647521972656e-06, + "grad_norm": 1.7789316177368164, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.5457, + "mean_token_accuracy": 0.8291507959365845, + "num_tokens": 39297566.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 3.427267074584961e-06, + "grad_norm": 1.60184645652771, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8583219647407532, + "num_tokens": 39332480.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.6027988195419312, + "learning_rate": 4.383213225943196e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8442298769950867, + "num_tokens": 39373646.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.4860843420028687, + "learning_rate": 4.3874523103009746e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8487169742584229, + "num_tokens": 39415915.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 1.638954520225525, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.5995, + "mean_token_accuracy": 0.8128172755241394, + "num_tokens": 39455805.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 1.6906551122665405, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8395966291427612, + "num_tokens": 39493665.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.5703380107879639, + "learning_rate": 4.400169563374311e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8384239077568054, + "num_tokens": 39535957.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 3.4421682357788086e-06, + "grad_norm": 1.477908730506897, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.433, + "mean_token_accuracy": 0.856669545173645, + "num_tokens": 39578683.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 3.471970558166504e-06, + "grad_norm": 1.6154309511184692, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8475369215011597, + "num_tokens": 39616096.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.6597245931625366, + "learning_rate": 4.4128868164476474e-07, + "loss": 0.5252, + "mean_token_accuracy": 0.828009843826294, + "num_tokens": 39656589.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.667869210243225, + "learning_rate": 4.417125900805426e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8301033973693848, + "num_tokens": 39694490.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 3.4570693969726562e-06, + "grad_norm": 1.5868841409683228, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8588498830795288, + "num_tokens": 39733909.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.549312710762024, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8539877533912659, + "num_tokens": 39772948.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.6050142049789429, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5051, + "mean_token_accuracy": 0.8344762325286865, + "num_tokens": 39816973.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.4592198133468628, + "learning_rate": 4.434082238236541e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8522403240203857, + "num_tokens": 39859615.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.5086275339126587, + "learning_rate": 4.4383213225943193e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8364052176475525, + "num_tokens": 39904131.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 3.4868717193603516e-06, + "grad_norm": 1.6778755187988281, + "learning_rate": 4.442560406952098e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8372870683670044, + "num_tokens": 39940938.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.7475775480270386, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.83680260181427, + "num_tokens": 39973646.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.6302129030227661, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8360869884490967, + "num_tokens": 40016130.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.5604952573776245, + "learning_rate": 4.455277660025434e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8514917492866516, + "num_tokens": 40054209.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 3.5017728805541992e-06, + "grad_norm": 1.5020216703414917, + "learning_rate": 4.459516744383213e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.824836254119873, + "num_tokens": 40096724.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.6503316164016724, + "learning_rate": 4.4637558287409916e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8451007604598999, + "num_tokens": 40134397.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.6083365678787231, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8361964225769043, + "num_tokens": 40176893.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.6509206295013428, + "learning_rate": 4.472233997456549e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8518558740615845, + "num_tokens": 40213880.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.5730427503585815, + "learning_rate": 4.476473081814328e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.84004145860672, + "num_tokens": 40258039.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 3.516674041748047e-06, + "grad_norm": 1.7592079639434814, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8425530195236206, + "num_tokens": 40294277.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.593881607055664, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.525, + "mean_token_accuracy": 0.83139967918396, + "num_tokens": 40338335.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.6597964763641357, + "learning_rate": 4.489190334887664e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.853119432926178, + "num_tokens": 40378524.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 3.5315752029418945e-06, + "grad_norm": 1.6593928337097168, + "learning_rate": 4.493429419245443e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8555742502212524, + "num_tokens": 40414872.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 3.546476364135742e-06, + "grad_norm": 1.6278581619262695, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8467621803283691, + "num_tokens": 40453520.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.4752819538116455, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.4669, + "mean_token_accuracy": 0.8495540618896484, + "num_tokens": 40500126.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.4746501445770264, + "learning_rate": 4.506146672318779e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8539749979972839, + "num_tokens": 40545105.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.681691288948059, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8488624691963196, + "num_tokens": 40579618.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.5732905864715576, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8506780862808228, + "num_tokens": 40618004.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7782896757125854, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.8398967981338501, + "num_tokens": 40652814.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 3.5762786865234375e-06, + "grad_norm": 1.9337795972824097, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4202, + "mean_token_accuracy": 0.858398973941803, + "num_tokens": 40682719.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.5318939685821533, + "learning_rate": 4.527342094107672e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8438963890075684, + "num_tokens": 40727190.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 3.56137752532959e-06, + "grad_norm": 1.7766170501708984, + "learning_rate": 4.531581178465451e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8324658870697021, + "num_tokens": 40761696.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.7646780014038086, + "learning_rate": 4.53582026282323e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8208814263343811, + "num_tokens": 40796984.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.6019134521484375, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.4777, + "mean_token_accuracy": 0.846114993095398, + "num_tokens": 40833669.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.894752025604248, + "learning_rate": 4.544298431538787e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8408617973327637, + "num_tokens": 40867648.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.68585205078125, + "learning_rate": 4.548537515896566e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.852177083492279, + "num_tokens": 40901259.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.6447495222091675, + "learning_rate": 4.552776600254345e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8547242879867554, + "num_tokens": 40940076.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.832760214805603, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8469992280006409, + "num_tokens": 40971585.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 3.591179847717285e-06, + "grad_norm": 1.5308878421783447, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.851343035697937, + "num_tokens": 41011450.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.5976483821868896, + "learning_rate": 4.565493853327681e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8391113877296448, + "num_tokens": 41049488.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.6266435384750366, + "learning_rate": 4.56973293768546e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.844343900680542, + "num_tokens": 41085211.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 3.606081008911133e-06, + "grad_norm": 1.499673843383789, + "learning_rate": 4.573972022043238e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8447243571281433, + "num_tokens": 41129923.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 3.6209821701049805e-06, + "grad_norm": 1.666867971420288, + "learning_rate": 4.578211106401017e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8381202220916748, + "num_tokens": 41165851.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.7724817991256714, + "learning_rate": 4.582450190758796e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.8479005098342896, + "num_tokens": 41197131.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.539621114730835, + "learning_rate": 4.586689275116575e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8487116098403931, + "num_tokens": 41241512.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.5308115482330322, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8302152752876282, + "num_tokens": 41287199.0, + "step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.611621618270874, + "learning_rate": 4.595167443832132e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8317059874534607, + "num_tokens": 41326548.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.7645624876022339, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8353450894355774, + "num_tokens": 41360211.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.5590673685073853, + "learning_rate": 4.60364561254769e-07, + "loss": 0.5317, + "mean_token_accuracy": 0.8274068832397461, + "num_tokens": 41405212.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.7147506475448608, + "learning_rate": 4.607884696905468e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8366752862930298, + "num_tokens": 41441818.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 3.635883331298828e-06, + "grad_norm": 1.702155351638794, + "learning_rate": 4.612123781263247e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8574328422546387, + "num_tokens": 41472756.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.5542899370193481, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8427249193191528, + "num_tokens": 41517243.0, + "step": 1090 + }, + { + "epoch": 0.13878641394224653, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.5990779399871826, + "learning_rate": 4.620601949978805e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8507713079452515, + "num_tokens": 41556223.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.6250433921813965, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8419824242591858, + "num_tokens": 41596729.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.5679962635040283, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8423503041267395, + "num_tokens": 41633390.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.4684010744094849, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.84519362449646, + "num_tokens": 41677197.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 3.6507844924926758e-06, + "grad_norm": 1.8238435983657837, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8363837003707886, + "num_tokens": 41712101.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.6284966468811035, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8492138385772705, + "num_tokens": 41753967.0, + "step": 1096 + }, + { + "epoch": 0.1395496756137896, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.6353528499603271, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.393, + "mean_token_accuracy": 0.8722856640815735, + "num_tokens": 41789502.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 3.6656856536865234e-06, + "grad_norm": 1.6243079900741577, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8441708087921143, + "num_tokens": 41830631.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.5181306600570679, + "learning_rate": 4.654514624841034e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8448337912559509, + "num_tokens": 41872650.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.6920533180236816, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8207859396934509, + "num_tokens": 41911326.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.6083983182907104, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8411859273910522, + "num_tokens": 41949775.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 3.680586814880371e-06, + "grad_norm": 1.6143630743026733, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.839874267578125, + "num_tokens": 41989179.0, + "step": 1102 + }, + { + "epoch": 0.14031293728533265, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.5237162113189697, + "learning_rate": 4.671470962272149e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8401803970336914, + "num_tokens": 42033509.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.7956534624099731, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8413894176483154, + "num_tokens": 42067005.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.8131067752838135, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.426, + "mean_token_accuracy": 0.8635004758834839, + "num_tokens": 42102078.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.5803298950195312, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8471357822418213, + "num_tokens": 42142192.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 3.6954879760742188e-06, + "grad_norm": 1.587598443031311, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8518809080123901, + "num_tokens": 42181062.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.6382750272750854, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8393064737319946, + "num_tokens": 42219559.0, + "step": 1108 + }, + { + "epoch": 0.1410761989568757, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.6849640607833862, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8452778458595276, + "num_tokens": 42255786.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.6786454916000366, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.4578, + "mean_token_accuracy": 0.8484723567962646, + "num_tokens": 42290306.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.586594820022583, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.4641, + "mean_token_accuracy": 0.8500994443893433, + "num_tokens": 42325610.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.4159481525421143, + "learning_rate": 4.709622721492157e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8498587608337402, + "num_tokens": 42369792.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.5525254011154175, + "learning_rate": 4.713861805849936e-07, + "loss": 0.4634, + "mean_token_accuracy": 0.8483641147613525, + "num_tokens": 42410480.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.5806761980056763, + "learning_rate": 4.718100890207715e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8410199284553528, + "num_tokens": 42447403.0, + "step": 1114 + }, + { + "epoch": 0.1418394606284188, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.683395504951477, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8574703931808472, + "num_tokens": 42483319.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 3.7103891372680664e-06, + "grad_norm": 1.5482885837554932, + "learning_rate": 4.726579058923272e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8530291318893433, + "num_tokens": 42524520.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.6288684606552124, + "learning_rate": 4.730818143281051e-07, + "loss": 0.3895, + "mean_token_accuracy": 0.8710423707962036, + "num_tokens": 42560671.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.9557477235794067, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8329528570175171, + "num_tokens": 42594431.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.571293592453003, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8524075746536255, + "num_tokens": 42634145.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 3.725290298461914e-06, + "grad_norm": 1.5727698802947998, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8407123684883118, + "num_tokens": 42675449.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + "ewc_loss": 3.7550926208496094e-06, + "grad_norm": 1.7009990215301514, + "learning_rate": 4.747774480712166e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.8580096960067749, + "num_tokens": 42707613.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 1.7272769212722778, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8457943797111511, + "num_tokens": 42741758.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 1.7474043369293213, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.4831, + "mean_token_accuracy": 0.8416100740432739, + "num_tokens": 42778732.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 3.7848949432373047e-06, + "grad_norm": 1.8170620203018188, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.5198, + "mean_token_accuracy": 0.831895112991333, + "num_tokens": 42818546.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.653044581413269, + "learning_rate": 4.764730818143281e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8530514240264893, + "num_tokens": 42855312.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.6680831909179688, + "learning_rate": 4.768969902501059e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.8429765701293945, + "num_tokens": 42891016.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.935537338256836, + "learning_rate": 4.773208986858838e-07, + "loss": 0.554, + "mean_token_accuracy": 0.8211879134178162, + "num_tokens": 42923292.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 3.7997961044311523e-06, + "grad_norm": 1.640941858291626, + "learning_rate": 4.777448071216617e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8457027673721313, + "num_tokens": 42959523.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7070317268371582, + "learning_rate": 4.781687155574396e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8301093578338623, + "num_tokens": 42999832.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7102619409561157, + "learning_rate": 4.785926239932175e-07, + "loss": 0.492, + "mean_token_accuracy": 0.8380277156829834, + "num_tokens": 43036231.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7103850841522217, + "learning_rate": 4.790165324289953e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.83796626329422, + "num_tokens": 43073455.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7527785301208496, + "learning_rate": 4.794404408647732e-07, + "loss": 0.4272, + "mean_token_accuracy": 0.8593869805335999, + "num_tokens": 43105132.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7418709993362427, + "learning_rate": 4.798643493005511e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8643760085105896, + "num_tokens": 43137977.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.6572920083999634, + "learning_rate": 4.80288257736329e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8451589345932007, + "num_tokens": 43180102.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.585272192955017, + "learning_rate": 4.807121661721068e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8481082916259766, + "num_tokens": 43221345.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 3.814697265625e-06, + "grad_norm": 1.7529160976409912, + "learning_rate": 4.811360746078847e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8498682975769043, + "num_tokens": 43257359.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7062914371490479, + "learning_rate": 4.815599830436625e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8328436613082886, + "num_tokens": 43295790.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.4588613510131836, + "learning_rate": 4.819838914794405e-07, + "loss": 0.4082, + "mean_token_accuracy": 0.8642786741256714, + "num_tokens": 43336522.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.5254954099655151, + "learning_rate": 4.824077999152183e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8414514064788818, + "num_tokens": 43381943.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.6447197198867798, + "learning_rate": 4.828317083509962e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8381069302558899, + "num_tokens": 43418173.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.754563570022583, + "learning_rate": 4.83255616786774e-07, + "loss": 0.6121, + "mean_token_accuracy": 0.8054741024971008, + "num_tokens": 43456925.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.444326639175415, + "learning_rate": 4.83679525222552e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8416178822517395, + "num_tokens": 43502212.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.5308641195297241, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8433356881141663, + "num_tokens": 43543815.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.7019084692001343, + "learning_rate": 4.845273420941076e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.8318531513214111, + "num_tokens": 43580168.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.481062650680542, + "learning_rate": 4.849512505298855e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8610672354698181, + "num_tokens": 43620674.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.736997365951538, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8523849844932556, + "num_tokens": 43654847.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 3.844499588012695e-06, + "grad_norm": 1.5968327522277832, + "learning_rate": 4.857990674014413e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8544928431510925, + "num_tokens": 43693980.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.8590319156646729, + "learning_rate": 4.862229758372191e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8426355123519897, + "num_tokens": 43725405.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.677501916885376, + "learning_rate": 4.86646884272997e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.85270094871521, + "num_tokens": 43763069.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 3.874301910400391e-06, + "grad_norm": 1.7247941493988037, + "learning_rate": 4.870707927087749e-07, + "loss": 0.4242, + "mean_token_accuracy": 0.8597814440727234, + "num_tokens": 43795320.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.519627571105957, + "learning_rate": 4.874947011445528e-07, + "loss": 0.4379, + "mean_token_accuracy": 0.8561159372329712, + "num_tokens": 43835306.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.4737221002578735, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8433003425598145, + "num_tokens": 43877798.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.571526288986206, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8489245176315308, + "num_tokens": 43914927.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.6365758180618286, + "learning_rate": 4.887664264518864e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8461087942123413, + "num_tokens": 43953475.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.5736361742019653, + "learning_rate": 4.891903348876643e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8462806940078735, + "num_tokens": 43991506.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.4936336278915405, + "learning_rate": 4.896142433234421e-07, + "loss": 0.442, + "mean_token_accuracy": 0.8501096963882446, + "num_tokens": 44028821.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.6414326429367065, + "learning_rate": 4.9003815175922e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8486365079879761, + "num_tokens": 44063497.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.5725237131118774, + "learning_rate": 4.904620601949979e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8262684345245361, + "num_tokens": 44104772.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.5644071102142334, + "learning_rate": 4.908859686307758e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8467599153518677, + "num_tokens": 44144138.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.5700470209121704, + "learning_rate": 4.913098770665536e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8463279008865356, + "num_tokens": 44185099.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 3.904104232788086e-06, + "grad_norm": 1.5989670753479004, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8537838459014893, + "num_tokens": 44223439.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.701720952987671, + "learning_rate": 4.921576939381094e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8514585494995117, + "num_tokens": 44257661.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.8404077291488647, + "learning_rate": 4.925816023738872e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.832554817199707, + "num_tokens": 44293683.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.7843939065933228, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.855071485042572, + "num_tokens": 44325993.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 3.933906555175781e-06, + "grad_norm": 1.7062760591506958, + "learning_rate": 4.934294192454429e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.8519002199172974, + "num_tokens": 44362116.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.6724412441253662, + "learning_rate": 4.938533276812209e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8412833213806152, + "num_tokens": 44400271.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.6118865013122559, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8461829423904419, + "num_tokens": 44444728.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.6482656002044678, + "learning_rate": 4.947011445527766e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8412904143333435, + "num_tokens": 44486953.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.596290111541748, + "learning_rate": 4.951250529885544e-07, + "loss": 0.4189, + "mean_token_accuracy": 0.8600557446479797, + "num_tokens": 44523585.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 3.993511199951172e-06, + "grad_norm": 1.5193837881088257, + "learning_rate": 4.955489614243324e-07, + "loss": 0.5001, + "mean_token_accuracy": 0.8384252786636353, + "num_tokens": 44565822.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.6983951330184937, + "learning_rate": 4.959728698601102e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8414914608001709, + "num_tokens": 44603512.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.6932127475738525, + "learning_rate": 4.963967782958881e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.8533716797828674, + "num_tokens": 44641908.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.9344559907913208, + "learning_rate": 4.968206867316659e-07, + "loss": 0.514, + "mean_token_accuracy": 0.8314075469970703, + "num_tokens": 44673413.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.6009888648986816, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4995, + "mean_token_accuracy": 0.8390047550201416, + "num_tokens": 44711948.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.660029649734497, + "learning_rate": 4.976685036032216e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8382996916770935, + "num_tokens": 44748494.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 4.023313522338867e-06, + "grad_norm": 1.5159059762954712, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4537, + "mean_token_accuracy": 0.8513103127479553, + "num_tokens": 44790881.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.7104040384292603, + "learning_rate": 4.985163204747774e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8323589563369751, + "num_tokens": 44827538.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.5974054336547852, + "learning_rate": 4.989402289105554e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8327499628067017, + "num_tokens": 44869528.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.5072730779647827, + "learning_rate": 4.993641373463331e-07, + "loss": 0.416, + "mean_token_accuracy": 0.8598261475563049, + "num_tokens": 44908736.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.5293196439743042, + "learning_rate": 4.997880457821111e-07, + "loss": 0.4353, + "mean_token_accuracy": 0.8556323051452637, + "num_tokens": 44947620.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.58290696144104, + "learning_rate": 5.002119542178889e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8517722487449646, + "num_tokens": 44989689.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.5025957822799683, + "learning_rate": 5.006358626536667e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8498106002807617, + "num_tokens": 45029147.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 4.0531158447265625e-06, + "grad_norm": 1.6435056924819946, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4508, + "mean_token_accuracy": 0.8488824963569641, + "num_tokens": 45062752.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.592446208000183, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8482146263122559, + "num_tokens": 45101974.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.613279938697815, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8606334924697876, + "num_tokens": 45138069.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.5289804935455322, + "learning_rate": 5.023314963967783e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.8538779020309448, + "num_tokens": 45176755.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.607720971107483, + "learning_rate": 5.027554048325562e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8635940551757812, + "num_tokens": 45211527.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.6501809358596802, + "learning_rate": 5.03179313268334e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8423969745635986, + "num_tokens": 45248248.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.6569371223449707, + "learning_rate": 5.036032217041119e-07, + "loss": 0.4597, + "mean_token_accuracy": 0.8510008454322815, + "num_tokens": 45284732.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.5449849367141724, + "learning_rate": 5.040271301398897e-07, + "loss": 0.4341, + "mean_token_accuracy": 0.8568721413612366, + "num_tokens": 45321720.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.5239636898040771, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4145, + "mean_token_accuracy": 0.8633008003234863, + "num_tokens": 45360330.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 4.112720489501953e-06, + "grad_norm": 1.4464448690414429, + "learning_rate": 5.048749470114455e-07, + "loss": 0.4184, + "mean_token_accuracy": 0.8639201521873474, + "num_tokens": 45401809.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.5170618295669556, + "learning_rate": 5.052988554472234e-07, + "loss": 0.4939, + "mean_token_accuracy": 0.8356643915176392, + "num_tokens": 45447005.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.5436497926712036, + "learning_rate": 5.057227638830013e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8467994928359985, + "num_tokens": 45488807.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.5617969036102295, + "learning_rate": 5.061466723187792e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8422380089759827, + "num_tokens": 45529261.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "ewc_loss": 4.082918167114258e-06, + "grad_norm": 1.5991021394729614, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.852082371711731, + "num_tokens": 45571054.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.6491549015045166, + "learning_rate": 5.069944891903349e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8420889377593994, + "num_tokens": 45608425.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.8126534223556519, + "learning_rate": 5.074183976261127e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8375309705734253, + "num_tokens": 45647963.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.6750216484069824, + "learning_rate": 5.078423060618906e-07, + "loss": 0.4612, + "mean_token_accuracy": 0.8480629324913025, + "num_tokens": 45684546.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.6308659315109253, + "learning_rate": 5.082662144976685e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8420127630233765, + "num_tokens": 45722721.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.590299129486084, + "learning_rate": 5.086901229334464e-07, + "loss": 0.4266, + "mean_token_accuracy": 0.8593931198120117, + "num_tokens": 45761693.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.7063761949539185, + "learning_rate": 5.091140313692243e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8368120193481445, + "num_tokens": 45795535.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.644837498664856, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8352811932563782, + "num_tokens": 45837988.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.5545227527618408, + "learning_rate": 5.099618482407799e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.865265965461731, + "num_tokens": 45877272.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 4.1425228118896484e-06, + "grad_norm": 1.7219759225845337, + "learning_rate": 5.103857566765578e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8397432565689087, + "num_tokens": 45912200.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.6308938264846802, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4882, + "mean_token_accuracy": 0.8414818644523621, + "num_tokens": 45953093.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.6488010883331299, + "learning_rate": 5.112335735481135e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8426270484924316, + "num_tokens": 45990389.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.5296036005020142, + "learning_rate": 5.116574819838915e-07, + "loss": 0.4071, + "mean_token_accuracy": 0.8674785494804382, + "num_tokens": 46027196.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.6959476470947266, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8414665460586548, + "num_tokens": 46061385.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 4.172325134277344e-06, + "grad_norm": 1.680820345878601, + "learning_rate": 5.125052988554473e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8512246012687683, + "num_tokens": 46095234.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5964691638946533, + "learning_rate": 5.12929207291225e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8276059627532959, + "num_tokens": 46133764.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.8522121906280518, + "learning_rate": 5.133531157270029e-07, + "loss": 0.5094, + "mean_token_accuracy": 0.8369308710098267, + "num_tokens": 46163714.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.7043761014938354, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8425151109695435, + "num_tokens": 46198752.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5201517343521118, + "learning_rate": 5.142009325985587e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8576245307922363, + "num_tokens": 46239585.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5719420909881592, + "learning_rate": 5.146248410343365e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8540674448013306, + "num_tokens": 46274174.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5530714988708496, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8380340337753296, + "num_tokens": 46315335.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.6147363185882568, + "learning_rate": 5.154726579058923e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8512758016586304, + "num_tokens": 46356394.0, + "step": 1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.8118335008621216, + "learning_rate": 5.158965663416703e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8399226069450378, + "num_tokens": 46389488.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5818251371383667, + "learning_rate": 5.16320474777448e-07, + "loss": 0.4514, + "mean_token_accuracy": 0.8482372760772705, + "num_tokens": 46431362.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.6199042797088623, + "learning_rate": 5.167443832132259e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8499637246131897, + "num_tokens": 46471450.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5256686210632324, + "learning_rate": 5.171682916490038e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8433188199996948, + "num_tokens": 46511675.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 4.202127456665039e-06, + "grad_norm": 1.5004230737686157, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4441, + "mean_token_accuracy": 0.8581641912460327, + "num_tokens": 46551982.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 4.26173210144043e-06, + "grad_norm": 1.5965404510498047, + "learning_rate": 5.180161085205595e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8364549279212952, + "num_tokens": 46592738.0, + "step": 1223 + }, + { + "epoch": 0.15570538099478437, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.454842209815979, + "learning_rate": 5.184400169563374e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8543595671653748, + "num_tokens": 46637240.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.5607446432113647, + "learning_rate": 5.188639253921153e-07, + "loss": 0.447, + "mean_token_accuracy": 0.85770183801651, + "num_tokens": 46675699.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 4.291534423828125e-06, + "grad_norm": 1.5824488401412964, + "learning_rate": 5.192878338278932e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8358290195465088, + "num_tokens": 46719314.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.6344072818756104, + "learning_rate": 5.19711742263671e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.8614429235458374, + "num_tokens": 46754512.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.673387050628662, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8415015339851379, + "num_tokens": 46792197.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.5792688131332397, + "learning_rate": 5.205595591352268e-07, + "loss": 0.4155, + "mean_token_accuracy": 0.8615263104438782, + "num_tokens": 46828318.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7090632915496826, + "learning_rate": 5.209834675710046e-07, + "loss": 0.4168, + "mean_token_accuracy": 0.8581997156143188, + "num_tokens": 46860917.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.528502345085144, + "learning_rate": 5.214073760067825e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8461065292358398, + "num_tokens": 46904364.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7407232522964478, + "learning_rate": 5.218312844425604e-07, + "loss": 0.3995, + "mean_token_accuracy": 0.8678889274597168, + "num_tokens": 46936586.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.6491484642028809, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8556437492370605, + "num_tokens": 46971713.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7241204977035522, + "learning_rate": 5.226791013141161e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8543705940246582, + "num_tokens": 47005518.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.6599998474121094, + "learning_rate": 5.23103009749894e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8457205891609192, + "num_tokens": 47045268.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.8397424221038818, + "learning_rate": 5.235269181856718e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8361798524856567, + "num_tokens": 47078928.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7163825035095215, + "learning_rate": 5.239508266214498e-07, + "loss": 0.4336, + "mean_token_accuracy": 0.8596937656402588, + "num_tokens": 47114933.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.5998071432113647, + "learning_rate": 5.243747350572276e-07, + "loss": 0.5269, + "mean_token_accuracy": 0.8284651041030884, + "num_tokens": 47156926.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.567944884300232, + "learning_rate": 5.247986434930056e-07, + "loss": 0.4211, + "mean_token_accuracy": 0.8628448843955994, + "num_tokens": 47195963.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.6475857496261597, + "learning_rate": 5.252225519287834e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8456642627716064, + "num_tokens": 47233000.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 2.835000514984131, + "learning_rate": 5.256464603645613e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.861321210861206, + "num_tokens": 47269263.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.7288535833358765, + "learning_rate": 5.260703688003391e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8491373062133789, + "num_tokens": 47310560.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.58292818069458, + "learning_rate": 5.26494277236117e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.8633084297180176, + "num_tokens": 47347742.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 4.32133674621582e-06, + "grad_norm": 1.4876189231872559, + "learning_rate": 5.269181856718948e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.8585451245307922, + "num_tokens": 47390645.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.6134134531021118, + "learning_rate": 5.273420941076727e-07, + "loss": 0.493, + "mean_token_accuracy": 0.8448745608329773, + "num_tokens": 47430853.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.571094274520874, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8499249815940857, + "num_tokens": 47469432.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.6634471416473389, + "learning_rate": 5.281899109792285e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8355691432952881, + "num_tokens": 47507117.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.5919138193130493, + "learning_rate": 5.286138194150064e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8551953434944153, + "num_tokens": 47547450.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.459146499633789, + "learning_rate": 5.290377278507841e-07, + "loss": 0.4598, + "mean_token_accuracy": 0.8495786190032959, + "num_tokens": 47588288.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.5151768922805786, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8552212119102478, + "num_tokens": 47632499.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 4.351139068603516e-06, + "grad_norm": 1.6940181255340576, + "learning_rate": 5.298855447223399e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8342736959457397, + "num_tokens": 47671232.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.6328917741775513, + "learning_rate": 5.303094531581178e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8190599679946899, + "num_tokens": 47713808.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.7248135805130005, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8462373614311218, + "num_tokens": 47747423.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.6623698472976685, + "learning_rate": 5.311572700296736e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8532065749168396, + "num_tokens": 47783741.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.6064928770065308, + "learning_rate": 5.315811784654515e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8524705171585083, + "num_tokens": 47821202.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.627332091331482, + "learning_rate": 5.320050869012294e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8444738984107971, + "num_tokens": 47859744.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.640040636062622, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5002, + "mean_token_accuracy": 0.8399423956871033, + "num_tokens": 47897123.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.5932765007019043, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8559650182723999, + "num_tokens": 47935049.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.604987621307373, + "learning_rate": 5.332768122085629e-07, + "loss": 0.426, + "mean_token_accuracy": 0.8579328060150146, + "num_tokens": 47971304.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.547836422920227, + "learning_rate": 5.337007206443408e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.8611801862716675, + "num_tokens": 48010714.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8314882516860962, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.8492147922515869, + "num_tokens": 48042661.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.8996614217758179, + "learning_rate": 5.345485375158966e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8167829513549805, + "num_tokens": 48073008.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.6415597200393677, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8547855615615845, + "num_tokens": 48106817.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 4.380941390991211e-06, + "grad_norm": 1.5954971313476562, + "learning_rate": 5.353963543874522e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8631417751312256, + "num_tokens": 48142126.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.5437324047088623, + "learning_rate": 5.358202628232301e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8318182229995728, + "num_tokens": 48185143.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.5150710344314575, + "learning_rate": 5.36244171259008e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8545811176300049, + "num_tokens": 48226936.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 4.410743713378906e-06, + "grad_norm": 1.608267903327942, + "learning_rate": 5.366680796947859e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8485168218612671, + "num_tokens": 48263902.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.5324510335922241, + "learning_rate": 5.370919881305637e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8490589261054993, + "num_tokens": 48302261.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.7386469841003418, + "learning_rate": 5.375158965663417e-07, + "loss": 0.4766, + "mean_token_accuracy": 0.8452510833740234, + "num_tokens": 48342338.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.600877285003662, + "learning_rate": 5.379398050021195e-07, + "loss": 0.4646, + "mean_token_accuracy": 0.8439211845397949, + "num_tokens": 48378646.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.5196495056152344, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8499593734741211, + "num_tokens": 48421394.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.555908441543579, + "learning_rate": 5.387876218736752e-07, + "loss": 0.4269, + "mean_token_accuracy": 0.8617026805877686, + "num_tokens": 48460163.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 4.4405460357666016e-06, + "grad_norm": 1.4917553663253784, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.8612025380134583, + "num_tokens": 48500107.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.5405282974243164, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8426187038421631, + "num_tokens": 48543172.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.5946695804595947, + "learning_rate": 5.400593471810089e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.8345056176185608, + "num_tokens": 48585079.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.5615159273147583, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8324398994445801, + "num_tokens": 48627139.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.6078968048095703, + "learning_rate": 5.409071640525647e-07, + "loss": 0.4588, + "mean_token_accuracy": 0.846666693687439, + "num_tokens": 48664895.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.6895878314971924, + "learning_rate": 5.413310724883425e-07, + "loss": 0.3824, + "mean_token_accuracy": 0.8708474636077881, + "num_tokens": 48697325.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 4.500150680541992e-06, + "grad_norm": 1.5376570224761963, + "learning_rate": 5.417549809241205e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8476795554161072, + "num_tokens": 48737995.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6699888706207275, + "learning_rate": 5.421788893598982e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8463518023490906, + "num_tokens": 48775130.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6551789045333862, + "learning_rate": 5.42602797795676e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8266767263412476, + "num_tokens": 48817621.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6761645078659058, + "learning_rate": 5.43026706231454e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8610422015190125, + "num_tokens": 48849331.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6022469997406006, + "learning_rate": 5.434506146672319e-07, + "loss": 0.4742, + "mean_token_accuracy": 0.84833163022995, + "num_tokens": 48887266.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.5771223306655884, + "learning_rate": 5.438745231030097e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8497837781906128, + "num_tokens": 48924988.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6433378458023071, + "learning_rate": 5.442984315387876e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8362824320793152, + "num_tokens": 48961487.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6200605630874634, + "learning_rate": 5.447223399745655e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8385003805160522, + "num_tokens": 49000070.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.6128792762756348, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8582603931427002, + "num_tokens": 49036606.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 4.5299530029296875e-06, + "grad_norm": 1.704252004623413, + "learning_rate": 5.455701568461212e-07, + "loss": 0.4081, + "mean_token_accuracy": 0.8658541440963745, + "num_tokens": 49068638.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.4722758531570435, + "learning_rate": 5.45994065281899e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8517658710479736, + "num_tokens": 49111049.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.6048277616500854, + "learning_rate": 5.46417973717677e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8376628160476685, + "num_tokens": 49151935.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.5820115804672241, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8452348113059998, + "num_tokens": 49190273.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.645084023475647, + "learning_rate": 5.472657905892327e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8270571231842041, + "num_tokens": 49225978.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.6479524374008179, + "learning_rate": 5.476896990250106e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8577158451080322, + "num_tokens": 49260285.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.5815988779067993, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8541949987411499, + "num_tokens": 49297388.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.511745572090149, + "learning_rate": 5.485375158965663e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8538168668746948, + "num_tokens": 49340756.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.6293296813964844, + "learning_rate": 5.489614243323442e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8448219299316406, + "num_tokens": 49377786.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.5376901626586914, + "learning_rate": 5.49385332768122e-07, + "loss": 0.4703, + "mean_token_accuracy": 0.8502914905548096, + "num_tokens": 49418141.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.5896494388580322, + "learning_rate": 5.498092412039e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8476241827011108, + "num_tokens": 49460422.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.5564475059509277, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.845556378364563, + "num_tokens": 49501430.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 4.559755325317383e-06, + "grad_norm": 1.6472276449203491, + "learning_rate": 5.506570580754557e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8460913300514221, + "num_tokens": 49539471.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.4946424961090088, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8554078340530396, + "num_tokens": 49585811.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.5753134489059448, + "learning_rate": 5.515048749470113e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8447113037109375, + "num_tokens": 49627523.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 4.589557647705078e-06, + "grad_norm": 1.6821134090423584, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.8594403266906738, + "num_tokens": 49658620.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.630969762802124, + "learning_rate": 5.523526918185671e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8425639867782593, + "num_tokens": 49696243.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.549527645111084, + "learning_rate": 5.52776600254345e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8460525274276733, + "num_tokens": 49738585.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.804520845413208, + "learning_rate": 5.532005086901229e-07, + "loss": 0.571, + "mean_token_accuracy": 0.810647189617157, + "num_tokens": 49774497.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.6817952394485474, + "learning_rate": 5.536244171259008e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8605995178222656, + "num_tokens": 49812144.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.6737815141677856, + "learning_rate": 5.540483255616786e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.842090368270874, + "num_tokens": 49848695.0, + "step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 4.6193599700927734e-06, + "grad_norm": 1.6019326448440552, + "learning_rate": 5.544722339974566e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.842831552028656, + "num_tokens": 49887981.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.677253246307373, + "learning_rate": 5.548961424332343e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8473255634307861, + "num_tokens": 49925114.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.6977046728134155, + "learning_rate": 5.553200508690123e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8386228680610657, + "num_tokens": 49963745.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.630673885345459, + "learning_rate": 5.557439593047901e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8510862588882446, + "num_tokens": 50003681.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.603300929069519, + "learning_rate": 5.56167867740568e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8437076807022095, + "num_tokens": 50043990.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.5775014162063599, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.843492865562439, + "num_tokens": 50083603.0, + "step": 1314 + }, + { + "epoch": 0.1672815163465208, + "ewc_loss": 4.649162292480469e-06, + "grad_norm": 1.6102626323699951, + "learning_rate": 5.570156846121238e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.849230170249939, + "num_tokens": 50119443.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.6089259386062622, + "learning_rate": 5.574395930479016e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8554395437240601, + "num_tokens": 50156844.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.5572878122329712, + "learning_rate": 5.578635014836796e-07, + "loss": 0.4002, + "mean_token_accuracy": 0.8684311509132385, + "num_tokens": 50194947.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.707613468170166, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5134, + "mean_token_accuracy": 0.8350629806518555, + "num_tokens": 50233117.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.6518405675888062, + "learning_rate": 5.587113183552353e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8371082544326782, + "num_tokens": 50272813.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 4.678964614868164e-06, + "grad_norm": 1.6101164817810059, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8487342596054077, + "num_tokens": 50309356.0, + "step": 1320 + }, + { + "epoch": 0.16804477801806386, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.5677942037582397, + "learning_rate": 5.59559135226791e-07, + "loss": 0.4427, + "mean_token_accuracy": 0.8571126461029053, + "num_tokens": 50350435.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.6909645795822144, + "learning_rate": 5.599830436625689e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8396005630493164, + "num_tokens": 50386405.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.4904556274414062, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8482199907302856, + "num_tokens": 50432013.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 4.587197303771973, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4732, + "mean_token_accuracy": 0.8489094972610474, + "num_tokens": 50474687.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 4.708766937255859e-06, + "grad_norm": 1.5942550897598267, + "learning_rate": 5.612547689699024e-07, + "loss": 0.444, + "mean_token_accuracy": 0.8567541837692261, + "num_tokens": 50514230.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.5998271703720093, + "learning_rate": 5.616786774056803e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8452863693237305, + "num_tokens": 50551861.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.5087943077087402, + "learning_rate": 5.621025858414582e-07, + "loss": 0.4444, + "mean_token_accuracy": 0.8529291152954102, + "num_tokens": 50595318.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.5419480800628662, + "learning_rate": 5.625264942772361e-07, + "loss": 0.3974, + "mean_token_accuracy": 0.8718889355659485, + "num_tokens": 50631939.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.7026981115341187, + "learning_rate": 5.629504027130139e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8509114980697632, + "num_tokens": 50667466.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.524073600769043, + "learning_rate": 5.633743111487919e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.853514552116394, + "num_tokens": 50707390.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.6203380823135376, + "learning_rate": 5.637982195845697e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8464645147323608, + "num_tokens": 50747332.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 4.738569259643555e-06, + "grad_norm": 1.5312755107879639, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4659, + "mean_token_accuracy": 0.8464882373809814, + "num_tokens": 50791935.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.5076425075531006, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8600468635559082, + "num_tokens": 50831972.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.5516926050186157, + "learning_rate": 5.650699448919033e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8606389760971069, + "num_tokens": 50871084.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.5858032703399658, + "learning_rate": 5.654938533276812e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8514383435249329, + "num_tokens": 50915108.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.7289150953292847, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8406251668930054, + "num_tokens": 50954349.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.5098540782928467, + "learning_rate": 5.663416701992369e-07, + "loss": 0.4085, + "mean_token_accuracy": 0.8635933995246887, + "num_tokens": 50993330.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.559893250465393, + "learning_rate": 5.667655786350149e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8494203090667725, + "num_tokens": 51035424.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.525743842124939, + "learning_rate": 5.671894870707927e-07, + "loss": 0.494, + "mean_token_accuracy": 0.8379696607589722, + "num_tokens": 51077932.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.5617622137069702, + "learning_rate": 5.676133955065705e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8592427968978882, + "num_tokens": 51118225.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.6714788675308228, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8443191647529602, + "num_tokens": 51158297.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.613566517829895, + "learning_rate": 5.684612123781263e-07, + "loss": 0.5212, + "mean_token_accuracy": 0.833150327205658, + "num_tokens": 51197327.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.7227222919464111, + "learning_rate": 5.688851208139042e-07, + "loss": 0.4867, + "mean_token_accuracy": 0.840248167514801, + "num_tokens": 51231126.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.6541849374771118, + "learning_rate": 5.69309029249682e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8348924517631531, + "num_tokens": 51268493.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.499804139137268, + "learning_rate": 5.697329376854599e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8587339520454407, + "num_tokens": 51305126.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.4643489122390747, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8518767356872559, + "num_tokens": 51349179.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.4779506921768188, + "learning_rate": 5.705807545570157e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8528919219970703, + "num_tokens": 51393040.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.606079339981079, + "learning_rate": 5.710046629927934e-07, + "loss": 0.4007, + "mean_token_accuracy": 0.8666421175003052, + "num_tokens": 51430392.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.6485373973846436, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8450362086296082, + "num_tokens": 51466355.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 4.76837158203125e-06, + "grad_norm": 1.881533145904541, + "learning_rate": 5.718524798643492e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8361626863479614, + "num_tokens": 51499559.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.552555799484253, + "learning_rate": 5.722763883001272e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8554294109344482, + "num_tokens": 51538934.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.539227843284607, + "learning_rate": 5.72700296735905e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8468493819236755, + "num_tokens": 51581229.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.675549864768982, + "learning_rate": 5.731242051716829e-07, + "loss": 0.4719, + "mean_token_accuracy": 0.8460043668746948, + "num_tokens": 51621285.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.5815774202346802, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8461981415748596, + "num_tokens": 51659864.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.585539698600769, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8421664237976074, + "num_tokens": 51702288.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.6365267038345337, + "learning_rate": 5.743959304790164e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8483246564865112, + "num_tokens": 51740628.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.586118221282959, + "learning_rate": 5.748198389147944e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8388419151306152, + "num_tokens": 51777288.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.633278727531433, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5195, + "mean_token_accuracy": 0.8305565714836121, + "num_tokens": 51816989.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.6025983095169067, + "learning_rate": 5.756676557863502e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8409066200256348, + "num_tokens": 51859427.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.7774323225021362, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8354287147521973, + "num_tokens": 51894503.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 4.798173904418945e-06, + "grad_norm": 1.51394522190094, + "learning_rate": 5.765154726579059e-07, + "loss": 0.418, + "mean_token_accuracy": 0.8597627282142639, + "num_tokens": 51934066.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.5409623384475708, + "learning_rate": 5.769393810936838e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8590888977050781, + "num_tokens": 51973725.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.629631757736206, + "learning_rate": 5.773632895294616e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.842519998550415, + "num_tokens": 52012456.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.6425806283950806, + "learning_rate": 5.777871979652394e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8357195258140564, + "num_tokens": 52053915.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 4.827976226806641e-06, + "grad_norm": 1.7769527435302734, + "learning_rate": 5.782111064010173e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8457587361335754, + "num_tokens": 52084755.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.618881344795227, + "learning_rate": 5.786350148367952e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8292302489280701, + "num_tokens": 52126716.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.604249119758606, + "learning_rate": 5.790589232725731e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8470420837402344, + "num_tokens": 52165149.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.5160787105560303, + "learning_rate": 5.79482831708351e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8537449836730957, + "num_tokens": 52207009.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.5423407554626465, + "learning_rate": 5.799067401441288e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8439901471138, + "num_tokens": 52250115.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 4.857778549194336e-06, + "grad_norm": 1.4509292840957642, + "learning_rate": 5.803306485799068e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.859598696231842, + "num_tokens": 52292410.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.7095531225204468, + "learning_rate": 5.807545570156845e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8295606374740601, + "num_tokens": 52328473.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.650549054145813, + "learning_rate": 5.811784654514624e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8625800609588623, + "num_tokens": 52364234.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.687541127204895, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8452380299568176, + "num_tokens": 52402487.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.6626513004302979, + "learning_rate": 5.820262823230182e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8564440011978149, + "num_tokens": 52440262.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.5516680479049683, + "learning_rate": 5.824501907587961e-07, + "loss": 0.4295, + "mean_token_accuracy": 0.8595690727233887, + "num_tokens": 52480251.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.7011116743087769, + "learning_rate": 5.82874099194574e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8312692642211914, + "num_tokens": 52516104.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.6520575284957886, + "learning_rate": 5.832980076303518e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8463426828384399, + "num_tokens": 52556071.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.6346697807312012, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8446407318115234, + "num_tokens": 52590598.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 4.887580871582031e-06, + "grad_norm": 1.6350780725479126, + "learning_rate": 5.841458245019075e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8487341403961182, + "num_tokens": 52627162.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.5281378030776978, + "learning_rate": 5.845697329376855e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.8669535517692566, + "num_tokens": 52662134.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.4860124588012695, + "learning_rate": 5.849936413734633e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8521183729171753, + "num_tokens": 52705871.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 4.9173831939697266e-06, + "grad_norm": 1.6742147207260132, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8391415476799011, + "num_tokens": 52740141.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.5426628589630127, + "learning_rate": 5.858414582450191e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8608721494674683, + "num_tokens": 52780789.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.581992506980896, + "learning_rate": 5.86265366680797e-07, + "loss": 0.4648, + "mean_token_accuracy": 0.8467870354652405, + "num_tokens": 52820828.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.8264509439468384, + "learning_rate": 5.866892751165748e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8550540208816528, + "num_tokens": 52851778.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.5756316184997559, + "learning_rate": 5.871131835523526e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8330055475234985, + "num_tokens": 52893678.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 4.947185516357422e-06, + "grad_norm": 1.5517330169677734, + "learning_rate": 5.875370919881305e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8589945435523987, + "num_tokens": 52933288.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.5812053680419922, + "learning_rate": 5.879610004239084e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8591298460960388, + "num_tokens": 52972865.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.6209604740142822, + "learning_rate": 5.883849088596863e-07, + "loss": 0.5595, + "mean_token_accuracy": 0.8255000710487366, + "num_tokens": 53014833.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.5065118074417114, + "learning_rate": 5.888088172954641e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8468108773231506, + "num_tokens": 53059592.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.6614130735397339, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8442637324333191, + "num_tokens": 53093533.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.5659654140472412, + "learning_rate": 5.896566341670199e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8394393920898438, + "num_tokens": 53137266.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.631332516670227, + "learning_rate": 5.900805426027977e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.848090410232544, + "num_tokens": 53171892.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.6975339651107788, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8588541746139526, + "num_tokens": 53205142.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.5237667560577393, + "learning_rate": 5.909283594743535e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8528527021408081, + "num_tokens": 53245657.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 4.976987838745117e-06, + "grad_norm": 1.6461033821105957, + "learning_rate": 5.913522679101314e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8392974138259888, + "num_tokens": 53284927.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.7306146621704102, + "learning_rate": 5.917761763459093e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8555024266242981, + "num_tokens": 53318771.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.4634696245193481, + "learning_rate": 5.922000847816871e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8338215351104736, + "num_tokens": 53364878.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.61836576461792, + "learning_rate": 5.926239932174651e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8539241552352905, + "num_tokens": 53401916.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.588817834854126, + "learning_rate": 5.930479016532429e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8469611406326294, + "num_tokens": 53441110.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 5.0067901611328125e-06, + "grad_norm": 1.607774019241333, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8363132476806641, + "num_tokens": 53480226.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6453524827957153, + "learning_rate": 5.938957185247986e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.84026038646698, + "num_tokens": 53522260.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6355431079864502, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4381, + "mean_token_accuracy": 0.8587404489517212, + "num_tokens": 53558664.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.656701683998108, + "learning_rate": 5.947435353963544e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8568046689033508, + "num_tokens": 53597020.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.5670791864395142, + "learning_rate": 5.951674438321323e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8475819230079651, + "num_tokens": 53636752.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.4572087526321411, + "learning_rate": 5.955913522679101e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8589872717857361, + "num_tokens": 53677980.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.5570435523986816, + "learning_rate": 5.96015260703688e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8527042269706726, + "num_tokens": 53717272.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.5678372383117676, + "learning_rate": 5.964391691394659e-07, + "loss": 0.436, + "mean_token_accuracy": 0.8571636080741882, + "num_tokens": 53761811.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.5234229564666748, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8337099552154541, + "num_tokens": 53802684.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6027990579605103, + "learning_rate": 5.972869860110216e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8393988609313965, + "num_tokens": 53846219.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6297239065170288, + "learning_rate": 5.977108944467994e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8561972379684448, + "num_tokens": 53881174.0, + "step": 1411 + }, + { + "epoch": 0.17962091336980027, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6586755514144897, + "learning_rate": 5.981348028825774e-07, + "loss": 0.4133, + "mean_token_accuracy": 0.857395350933075, + "num_tokens": 53915665.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.6787523031234741, + "learning_rate": 5.985587113183552e-07, + "loss": 0.4245, + "mean_token_accuracy": 0.8583648204803467, + "num_tokens": 53950529.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.590287446975708, + "learning_rate": 5.989826197541331e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8428825736045837, + "num_tokens": 53990010.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 5.036592483520508e-06, + "grad_norm": 1.7266130447387695, + "learning_rate": 5.99406528189911e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8299532532691956, + "num_tokens": 54025506.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.6847772598266602, + "learning_rate": 5.998304366256888e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8371463418006897, + "num_tokens": 54063246.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.5974067449569702, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8367782235145569, + "num_tokens": 54102336.0, + "step": 1417 + }, + { + "epoch": 0.18038417504134335, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.6172058582305908, + "learning_rate": 6.006782534972446e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8510940074920654, + "num_tokens": 54141850.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.5712138414382935, + "learning_rate": 6.011021619330224e-07, + "loss": 0.4419, + "mean_token_accuracy": 0.8502221703529358, + "num_tokens": 54178743.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.6282116174697876, + "learning_rate": 6.015260703688004e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8587746620178223, + "num_tokens": 54212993.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 5.066394805908203e-06, + "grad_norm": 1.654768466949463, + "learning_rate": 6.019499788045782e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.842867910861969, + "num_tokens": 54252584.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.713169813156128, + "learning_rate": 6.023738872403561e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8352801203727722, + "num_tokens": 54290212.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.8118276596069336, + "learning_rate": 6.02797795676134e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8305468559265137, + "num_tokens": 54323777.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.7993429899215698, + "learning_rate": 6.032217041119118e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8430138826370239, + "num_tokens": 54361809.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.5718779563903809, + "learning_rate": 6.036456125476896e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8480669260025024, + "num_tokens": 54400611.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.4526804685592651, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8588022589683533, + "num_tokens": 54444399.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.6266337633132935, + "learning_rate": 6.044934294192454e-07, + "loss": 0.4952, + "mean_token_accuracy": 0.8408254981040955, + "num_tokens": 54484834.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 5.125999450683594e-06, + "grad_norm": 1.6806951761245728, + "learning_rate": 6.049173378550233e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.846664547920227, + "num_tokens": 54524539.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.4781677722930908, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8459190130233765, + "num_tokens": 54567337.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.6576684713363647, + "learning_rate": 6.05765154726579e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8472689390182495, + "num_tokens": 54605568.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.6420161724090576, + "learning_rate": 6.061890631623569e-07, + "loss": 0.5101, + "mean_token_accuracy": 0.8332729935646057, + "num_tokens": 54644502.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.5049577951431274, + "learning_rate": 6.066129715981347e-07, + "loss": 0.3899, + "mean_token_accuracy": 0.8699245452880859, + "num_tokens": 54682290.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.5490853786468506, + "learning_rate": 6.070368800339126e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8509107828140259, + "num_tokens": 54725021.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.659505009651184, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8562653064727783, + "num_tokens": 54762524.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 5.155801773071289e-06, + "grad_norm": 1.6094471216201782, + "learning_rate": 6.078846969054684e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.8520171046257019, + "num_tokens": 54802125.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.4648491144180298, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8471983671188354, + "num_tokens": 54848414.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.8287289142608643, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8453955054283142, + "num_tokens": 54879231.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.5185396671295166, + "learning_rate": 6.09156422212802e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8546090722084045, + "num_tokens": 54922277.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.6895147562026978, + "learning_rate": 6.095803306485799e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8281083106994629, + "num_tokens": 54960332.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.6825520992279053, + "learning_rate": 6.100042390843577e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8516096472740173, + "num_tokens": 54995007.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 5.185604095458984e-06, + "grad_norm": 1.4838886260986328, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4546, + "mean_token_accuracy": 0.8464703559875488, + "num_tokens": 55038953.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.5346894264221191, + "learning_rate": 6.108520559559135e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8343218564987183, + "num_tokens": 55081550.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.809996247291565, + "learning_rate": 6.112759643916914e-07, + "loss": 0.5293, + "mean_token_accuracy": 0.8307143449783325, + "num_tokens": 55117100.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7030220031738281, + "learning_rate": 6.116998728274693e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8271352052688599, + "num_tokens": 55157105.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.6011807918548584, + "learning_rate": 6.121237812632472e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8488059043884277, + "num_tokens": 55196534.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.5403335094451904, + "learning_rate": 6.125476896990249e-07, + "loss": 0.3921, + "mean_token_accuracy": 0.8698729276657104, + "num_tokens": 55231245.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.600556492805481, + "learning_rate": 6.129715981348028e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.837763786315918, + "num_tokens": 55273199.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.490922451019287, + "learning_rate": 6.133955065705807e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8444031476974487, + "num_tokens": 55319765.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "ewc_loss": 5.21540641784668e-06, + "grad_norm": 1.7179921865463257, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8420165777206421, + "num_tokens": 55356647.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.5626736879348755, + "learning_rate": 6.142433234421365e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8607907295227051, + "num_tokens": 55400025.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.5230226516723633, + "learning_rate": 6.146672318779143e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8490031361579895, + "num_tokens": 55441558.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.561914086341858, + "learning_rate": 6.150911403136923e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8562023639678955, + "num_tokens": 55482691.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.5604766607284546, + "learning_rate": 6.155150487494701e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.837851345539093, + "num_tokens": 55525193.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.6616851091384888, + "learning_rate": 6.159389571852479e-07, + "loss": 0.4015, + "mean_token_accuracy": 0.8689659833908081, + "num_tokens": 55561084.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.505424976348877, + "learning_rate": 6.163628656210258e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8458125591278076, + "num_tokens": 55604049.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 5.245208740234375e-06, + "grad_norm": 1.606525182723999, + "learning_rate": 6.167867740568037e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8517481088638306, + "num_tokens": 55641557.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.6746902465820312, + "learning_rate": 6.172106824925815e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8462123870849609, + "num_tokens": 55679808.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 5.27501106262207e-06, + "grad_norm": 1.4982706308364868, + "learning_rate": 6.176345909283595e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8587610721588135, + "num_tokens": 55723219.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.6031367778778076, + "learning_rate": 6.180584993641373e-07, + "loss": 0.5374, + "mean_token_accuracy": 0.8247283101081848, + "num_tokens": 55764644.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.7449893951416016, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8596818447113037, + "num_tokens": 55797789.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5559518337249756, + "learning_rate": 6.189063162356931e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8564242124557495, + "num_tokens": 55837281.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.6242823600769043, + "learning_rate": 6.193302246714709e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8331276178359985, + "num_tokens": 55878629.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5410728454589844, + "learning_rate": 6.197541331072488e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8502168655395508, + "num_tokens": 55918664.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5797114372253418, + "learning_rate": 6.201780415430267e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8550184965133667, + "num_tokens": 55957869.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.4644297361373901, + "learning_rate": 6.206019499788045e-07, + "loss": 0.4089, + "mean_token_accuracy": 0.8660962581634521, + "num_tokens": 55997685.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5834832191467285, + "learning_rate": 6.210258584145825e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.843879222869873, + "num_tokens": 56035985.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5621169805526733, + "learning_rate": 6.214497668503603e-07, + "loss": 0.4607, + "mean_token_accuracy": 0.8446913957595825, + "num_tokens": 56074801.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5488086938858032, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4387, + "mean_token_accuracy": 0.8536882400512695, + "num_tokens": 56113267.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.6569098234176636, + "learning_rate": 6.22297583721916e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8541185855865479, + "num_tokens": 56152217.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5447417497634888, + "learning_rate": 6.227214921576938e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.8611178398132324, + "num_tokens": 56192252.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.6123907566070557, + "learning_rate": 6.231454005934718e-07, + "loss": 0.421, + "mean_token_accuracy": 0.8604702949523926, + "num_tokens": 56228745.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5603018999099731, + "learning_rate": 6.235693090292496e-07, + "loss": 0.4041, + "mean_token_accuracy": 0.8644362092018127, + "num_tokens": 56268465.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.8034650087356567, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8354556560516357, + "num_tokens": 56305819.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.7920721769332886, + "learning_rate": 6.244171259008054e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.837499737739563, + "num_tokens": 56338678.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.7572612762451172, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8556552529335022, + "num_tokens": 56375279.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 5.304813385009766e-06, + "grad_norm": 1.5869264602661133, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4473, + "mean_token_accuracy": 0.8526239991188049, + "num_tokens": 56415600.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 5.364418029785156e-06, + "grad_norm": 1.6261266469955444, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4431, + "mean_token_accuracy": 0.8555888533592224, + "num_tokens": 56453195.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.5319746732711792, + "learning_rate": 6.261127596439168e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8273317217826843, + "num_tokens": 56500752.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.4376336336135864, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8452349901199341, + "num_tokens": 56546970.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 5.334615707397461e-06, + "grad_norm": 1.5843099355697632, + "learning_rate": 6.269605765154726e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8381304740905762, + "num_tokens": 56584996.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.618299961090088, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8548591732978821, + "num_tokens": 56622665.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.6902204751968384, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8539519309997559, + "num_tokens": 56658201.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.6062120199203491, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4337, + "mean_token_accuracy": 0.8561263680458069, + "num_tokens": 56694587.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.723219871520996, + "learning_rate": 6.286562102585841e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8434011936187744, + "num_tokens": 56727336.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.5830020904541016, + "learning_rate": 6.29080118694362e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8276494145393372, + "num_tokens": 56773772.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.602837085723877, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.849073588848114, + "num_tokens": 56815328.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.5769143104553223, + "learning_rate": 6.299279355659178e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.8639364242553711, + "num_tokens": 56850819.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.6030365228652954, + "learning_rate": 6.303518440016956e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8403118252754211, + "num_tokens": 56893096.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 5.3942203521728516e-06, + "grad_norm": 1.6598458290100098, + "learning_rate": 6.307757524374735e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8433176279067993, + "num_tokens": 56927550.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.7185436487197876, + "learning_rate": 6.311996608732514e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8380976319313049, + "num_tokens": 56966305.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.7370243072509766, + "learning_rate": 6.316235693090292e-07, + "loss": 0.4606, + "mean_token_accuracy": 0.8485453128814697, + "num_tokens": 57003654.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.5498615503311157, + "learning_rate": 6.320474777448071e-07, + "loss": 0.4452, + "mean_token_accuracy": 0.8537393808364868, + "num_tokens": 57041592.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.6061946153640747, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8583647608757019, + "num_tokens": 57077053.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.5335744619369507, + "learning_rate": 6.328952946163628e-07, + "loss": 0.441, + "mean_token_accuracy": 0.8537291288375854, + "num_tokens": 57116713.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.6650776863098145, + "learning_rate": 6.333192030521407e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8530620336532593, + "num_tokens": 57158019.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.6070079803466797, + "learning_rate": 6.337431114879186e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8496732711791992, + "num_tokens": 57195911.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 5.453824996948242e-06, + "grad_norm": 1.6836525201797485, + "learning_rate": 6.341670199236965e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8307716846466064, + "num_tokens": 57233255.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.4784679412841797, + "learning_rate": 6.345909283594744e-07, + "loss": 0.3939, + "mean_token_accuracy": 0.8698412179946899, + "num_tokens": 57272890.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.5779683589935303, + "learning_rate": 6.350148367952522e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8474928140640259, + "num_tokens": 57312918.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.5471892356872559, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4289, + "mean_token_accuracy": 0.8630579113960266, + "num_tokens": 57350614.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 5.424022674560547e-06, + "grad_norm": 1.5851556062698364, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8536034822463989, + "num_tokens": 57385336.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.525179386138916, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4169, + "mean_token_accuracy": 0.8620074987411499, + "num_tokens": 57424859.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 5.4836273193359375e-06, + "grad_norm": 1.6892704963684082, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4098, + "mean_token_accuracy": 0.8665154576301575, + "num_tokens": 57458323.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.642876148223877, + "learning_rate": 6.371343789741416e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8433696031570435, + "num_tokens": 57492610.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.3869322538375854, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4417, + "mean_token_accuracy": 0.8556957244873047, + "num_tokens": 57538444.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7121068239212036, + "learning_rate": 6.379821958456974e-07, + "loss": 0.5337, + "mean_token_accuracy": 0.829365611076355, + "num_tokens": 57574866.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.577881097793579, + "learning_rate": 6.384061042814751e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8415637016296387, + "num_tokens": 57616019.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.9391131401062012, + "learning_rate": 6.38830012717253e-07, + "loss": 0.4794, + "mean_token_accuracy": 0.8431186676025391, + "num_tokens": 57652901.0, + "step": 1508 + }, + { + "epoch": 0.19196031039307976, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.8831658363342285, + "learning_rate": 6.392539211530309e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8388084173202515, + "num_tokens": 57683234.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.669604778289795, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8508759140968323, + "num_tokens": 57718967.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.6373990774154663, + "learning_rate": 6.401017380245867e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.8617022037506104, + "num_tokens": 57753415.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.568011999130249, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8609452247619629, + "num_tokens": 57789325.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.6801201105117798, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4168, + "mean_token_accuracy": 0.8612701892852783, + "num_tokens": 57822235.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.546322226524353, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8396251201629639, + "num_tokens": 57863684.0, + "step": 1514 + }, + { + "epoch": 0.19272357206462282, + "ewc_loss": 5.513429641723633e-06, + "grad_norm": 1.7494186162948608, + "learning_rate": 6.417973717676981e-07, + "loss": 0.4178, + "mean_token_accuracy": 0.8616691827774048, + "num_tokens": 57897001.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.6626675128936768, + "learning_rate": 6.42221280203476e-07, + "loss": 0.4471, + "mean_token_accuracy": 0.8515052199363708, + "num_tokens": 57932597.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.4887639284133911, + "learning_rate": 6.426451886392539e-07, + "loss": 0.4103, + "mean_token_accuracy": 0.8638315200805664, + "num_tokens": 57976576.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.6869621276855469, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8379265069961548, + "num_tokens": 58015288.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.5580716133117676, + "learning_rate": 6.434930055108097e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8489132523536682, + "num_tokens": 58055536.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.569740653038025, + "learning_rate": 6.439169139465875e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8434780240058899, + "num_tokens": 58096534.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.6490252017974854, + "learning_rate": 6.443408223823655e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8447237014770508, + "num_tokens": 58134205.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.5548977851867676, + "learning_rate": 6.447647308181432e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.852549135684967, + "num_tokens": 58173489.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.6580039262771606, + "learning_rate": 6.451886392539211e-07, + "loss": 0.4435, + "mean_token_accuracy": 0.8577184677124023, + "num_tokens": 58212386.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 5.543231964111328e-06, + "grad_norm": 1.5684734582901, + "learning_rate": 6.45612547689699e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8438389301300049, + "num_tokens": 58254564.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.634505033493042, + "learning_rate": 6.460364561254769e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.8583325147628784, + "num_tokens": 58289217.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.6967483758926392, + "learning_rate": 6.464603645612547e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.85302734375, + "num_tokens": 58324329.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.614296555519104, + "learning_rate": 6.468842729970327e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.8539882898330688, + "num_tokens": 58360319.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.5912866592407227, + "learning_rate": 6.473081814328105e-07, + "loss": 0.4613, + "mean_token_accuracy": 0.8459653854370117, + "num_tokens": 58396850.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.514003872871399, + "learning_rate": 6.477320898685885e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.8601991534233093, + "num_tokens": 58439151.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.5822510719299316, + "learning_rate": 6.481559983043662e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8494062423706055, + "num_tokens": 58479074.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 5.5730342864990234e-06, + "grad_norm": 1.4711133241653442, + "learning_rate": 6.48579906740144e-07, + "loss": 0.3895, + "mean_token_accuracy": 0.8665275573730469, + "num_tokens": 58520216.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.6794897317886353, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8468063473701477, + "num_tokens": 58550934.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.6429626941680908, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8382718563079834, + "num_tokens": 58589028.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.4472137689590454, + "learning_rate": 6.498516320474777e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8610737323760986, + "num_tokens": 58634491.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.5346564054489136, + "learning_rate": 6.502755404832556e-07, + "loss": 0.3991, + "mean_token_accuracy": 0.8708027005195618, + "num_tokens": 58671687.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.60231351852417, + "learning_rate": 6.506994489190335e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8580742478370667, + "num_tokens": 58710509.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.6759912967681885, + "learning_rate": 6.511233573548114e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8508705496788025, + "num_tokens": 58745860.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.527100920677185, + "learning_rate": 6.515472657905892e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8587772846221924, + "num_tokens": 58785908.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.5786465406417847, + "learning_rate": 6.51971174226367e-07, + "loss": 0.4257, + "mean_token_accuracy": 0.8565787076950073, + "num_tokens": 58823906.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.6523579359054565, + "learning_rate": 6.52395082662145e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8364348411560059, + "num_tokens": 58862998.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 5.602836608886719e-06, + "grad_norm": 1.8544135093688965, + "learning_rate": 6.528189910979228e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8454421758651733, + "num_tokens": 58896544.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.568179965019226, + "learning_rate": 6.532428995337007e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8503119349479675, + "num_tokens": 58934410.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.8557463884353638, + "learning_rate": 6.536668079694786e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8552408218383789, + "num_tokens": 58967330.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.5265944004058838, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8609445095062256, + "num_tokens": 59007134.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.6792423725128174, + "learning_rate": 6.545146248410343e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8538246750831604, + "num_tokens": 59044418.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.5384395122528076, + "learning_rate": 6.549385332768122e-07, + "loss": 0.4071, + "mean_token_accuracy": 0.8648402094841003, + "num_tokens": 59081444.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.5703176259994507, + "learning_rate": 6.5536244171259e-07, + "loss": 0.4249, + "mean_token_accuracy": 0.8626092672348022, + "num_tokens": 59120955.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.653429627418518, + "learning_rate": 6.55786350148368e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8371716737747192, + "num_tokens": 59159287.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.600184679031372, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8518885374069214, + "num_tokens": 59197134.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.5779547691345215, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4073, + "mean_token_accuracy": 0.8630796074867249, + "num_tokens": 59231269.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 5.632638931274414e-06, + "grad_norm": 1.685532569885254, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8473852872848511, + "num_tokens": 59268311.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.6460238695144653, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.8450912833213806, + "num_tokens": 59303788.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.5452476739883423, + "learning_rate": 6.579058923272573e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8384154438972473, + "num_tokens": 59346425.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.8165112733840942, + "learning_rate": 6.583298007630351e-07, + "loss": 0.457, + "mean_token_accuracy": 0.8508802652359009, + "num_tokens": 59379296.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.5469475984573364, + "learning_rate": 6.58753709198813e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8585806488990784, + "num_tokens": 59416891.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.5737923383712769, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4143, + "mean_token_accuracy": 0.8605707883834839, + "num_tokens": 59452426.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.718359112739563, + "learning_rate": 6.596015260703688e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8595930337905884, + "num_tokens": 59486563.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 5.662441253662109e-06, + "grad_norm": 1.6042808294296265, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8551934361457825, + "num_tokens": 59521695.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 5.692243576049805e-06, + "grad_norm": 1.5924396514892578, + "learning_rate": 6.604493429419246e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8306567668914795, + "num_tokens": 59563894.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.5712060928344727, + "learning_rate": 6.608732513777023e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.851035475730896, + "num_tokens": 59601623.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.6588186025619507, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8612471222877502, + "num_tokens": 59636323.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7316827774047852, + "learning_rate": 6.617210682492581e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8496615886688232, + "num_tokens": 59674449.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.596758484840393, + "learning_rate": 6.62144976685036e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8460879325866699, + "num_tokens": 59716708.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.6343693733215332, + "learning_rate": 6.625688851208139e-07, + "loss": 0.4492, + "mean_token_accuracy": 0.8513385057449341, + "num_tokens": 59752578.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7066596746444702, + "learning_rate": 6.629927935565918e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8587469458580017, + "num_tokens": 59787916.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7785218954086304, + "learning_rate": 6.634167019923696e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 59824122.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.5224101543426514, + "learning_rate": 6.638406104281476e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8586891889572144, + "num_tokens": 59862782.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.5603859424591064, + "learning_rate": 6.642645188639253e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8531850576400757, + "num_tokens": 59902163.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7389395236968994, + "learning_rate": 6.646884272997032e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8444654941558838, + "num_tokens": 59937881.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.6239749193191528, + "learning_rate": 6.651123357354811e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8445705771446228, + "num_tokens": 59975190.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.5101306438446045, + "learning_rate": 6.655362441712589e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8482520580291748, + "num_tokens": 60020177.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 5.7220458984375e-06, + "grad_norm": 1.7066892385482788, + "learning_rate": 6.659601526070369e-07, + "loss": 0.4359, + "mean_token_accuracy": 0.854079008102417, + "num_tokens": 60054290.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7296509742736816, + "learning_rate": 6.663840610428147e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8628086447715759, + "num_tokens": 60087330.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6870328187942505, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.8626590967178345, + "num_tokens": 60124773.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.5716758966445923, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4241, + "mean_token_accuracy": 0.8601424694061279, + "num_tokens": 60162021.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6049528121948242, + "learning_rate": 6.676557863501483e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8510828018188477, + "num_tokens": 60200660.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6414905786514282, + "learning_rate": 6.680796947859262e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.856712818145752, + "num_tokens": 60235622.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.60954749584198, + "learning_rate": 6.685036032217041e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8420318365097046, + "num_tokens": 60278063.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.5775421857833862, + "learning_rate": 6.689275116574819e-07, + "loss": 0.3908, + "mean_token_accuracy": 0.8680142164230347, + "num_tokens": 60311374.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.654835820198059, + "learning_rate": 6.693514200932599e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8305957317352295, + "num_tokens": 60354011.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.5827996730804443, + "learning_rate": 6.697753285290377e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8419041633605957, + "num_tokens": 60395280.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.7448508739471436, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8552011251449585, + "num_tokens": 60427641.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.654495120048523, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4458, + "mean_token_accuracy": 0.8548741340637207, + "num_tokens": 60463083.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.5686200857162476, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4563, + "mean_token_accuracy": 0.8502556681632996, + "num_tokens": 60500710.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.642106056213379, + "learning_rate": 6.714709622721492e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8573213815689087, + "num_tokens": 60538597.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6670957803726196, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.855250358581543, + "num_tokens": 60571706.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6984699964523315, + "learning_rate": 6.723187791437049e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8344694972038269, + "num_tokens": 60611648.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.6236066818237305, + "learning_rate": 6.727426875794829e-07, + "loss": 0.4671, + "mean_token_accuracy": 0.8485354781150818, + "num_tokens": 60647642.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 5.781650543212891e-06, + "grad_norm": 1.5800882577896118, + "learning_rate": 6.731665960152607e-07, + "loss": 0.4419, + "mean_token_accuracy": 0.8540529012680054, + "num_tokens": 60684790.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.506426215171814, + "learning_rate": 6.735905044510385e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8448641300201416, + "num_tokens": 60725671.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 5.841255187988281e-06, + "grad_norm": 1.6238770484924316, + "learning_rate": 6.740144128868164e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8459383249282837, + "num_tokens": 60761537.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.5759669542312622, + "learning_rate": 6.744383213225942e-07, + "loss": 0.4961, + "mean_token_accuracy": 0.8401148319244385, + "num_tokens": 60802055.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.5748419761657715, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8440932035446167, + "num_tokens": 60844131.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.546918272972107, + "learning_rate": 6.7528613819415e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8314073085784912, + "num_tokens": 60885645.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.7067899703979492, + "learning_rate": 6.757100466299279e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8487805128097534, + "num_tokens": 60925174.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.588418960571289, + "learning_rate": 6.761339550657058e-07, + "loss": 0.4126, + "mean_token_accuracy": 0.866041362285614, + "num_tokens": 60963190.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.6123690605163574, + "learning_rate": 6.765578635014837e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8417201638221741, + "num_tokens": 61004892.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 5.8710575103759766e-06, + "grad_norm": 1.6454020738601685, + "learning_rate": 6.769817719372614e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8469011187553406, + "num_tokens": 61044217.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.6724166870117188, + "learning_rate": 6.774056803730394e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8488122820854187, + "num_tokens": 61077444.0, + "step": 1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.5671190023422241, + "learning_rate": 6.778295888088172e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.839389443397522, + "num_tokens": 61117510.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.5177260637283325, + "learning_rate": 6.782534972445952e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8500640988349915, + "num_tokens": 61158532.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.51376211643219, + "learning_rate": 6.78677405680373e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8624460697174072, + "num_tokens": 61194406.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.7132972478866577, + "learning_rate": 6.791013141161509e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8406973481178284, + "num_tokens": 61230323.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.5813592672348022, + "learning_rate": 6.795252225519288e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8430484533309937, + "num_tokens": 61268273.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 5.930662155151367e-06, + "grad_norm": 1.6015137434005737, + "learning_rate": 6.799491309877067e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8595465421676636, + "num_tokens": 61304719.0, + "step": 1605 + }, + { + "epoch": 0.20429970741635925, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.5846799612045288, + "learning_rate": 6.803730394234844e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8509184122085571, + "num_tokens": 61343905.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.6195428371429443, + "learning_rate": 6.807969478592624e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8442249298095703, + "num_tokens": 61382846.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.614927887916565, + "learning_rate": 6.812208562950402e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8445888757705688, + "num_tokens": 61420388.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.5120583772659302, + "learning_rate": 6.816447647308182e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8534024357795715, + "num_tokens": 61462023.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.6138191223144531, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4758, + "mean_token_accuracy": 0.8458964228630066, + "num_tokens": 61500538.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.782222867012024, + "learning_rate": 6.824925816023738e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8312358856201172, + "num_tokens": 61540304.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7117739915847778, + "learning_rate": 6.829164900381518e-07, + "loss": 0.5075, + "mean_token_accuracy": 0.8364423513412476, + "num_tokens": 61576973.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.5463498830795288, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8609534502029419, + "num_tokens": 61612464.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.6242352724075317, + "learning_rate": 6.837643069097074e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8413263559341431, + "num_tokens": 61650815.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.7039812803268433, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4512, + "mean_token_accuracy": 0.8531612157821655, + "num_tokens": 61685844.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.6193875074386597, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4141, + "mean_token_accuracy": 0.8661351203918457, + "num_tokens": 61722198.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.469926118850708, + "learning_rate": 6.850360322170411e-07, + "loss": 0.3901, + "mean_token_accuracy": 0.8700950741767883, + "num_tokens": 61764075.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.724733591079712, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.8587671518325806, + "num_tokens": 61794384.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.4441752433776855, + "learning_rate": 6.858838490885968e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8583670258522034, + "num_tokens": 61837218.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.5426111221313477, + "learning_rate": 6.863077575243748e-07, + "loss": 0.4287, + "mean_token_accuracy": 0.859146237373352, + "num_tokens": 61879564.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.6637217998504639, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8436462879180908, + "num_tokens": 61918089.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 5.9604644775390625e-06, + "grad_norm": 1.8220378160476685, + "learning_rate": 6.871555743959304e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8486970067024231, + "num_tokens": 61948428.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.5401430130004883, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.8506686091423035, + "num_tokens": 61991282.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.6500353813171387, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8478263020515442, + "num_tokens": 62028499.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.5081844329833984, + "learning_rate": 6.884272997032641e-07, + "loss": 0.4167, + "mean_token_accuracy": 0.8602029085159302, + "num_tokens": 62070495.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 6.020069122314453e-06, + "grad_norm": 1.5191502571105957, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4273, + "mean_token_accuracy": 0.8627461194992065, + "num_tokens": 62111122.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.7651609182357788, + "learning_rate": 6.892751165748198e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8501862287521362, + "num_tokens": 62143940.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.4981303215026855, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8615936040878296, + "num_tokens": 62182084.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.5528738498687744, + "learning_rate": 6.901229334463755e-07, + "loss": 0.4194, + "mean_token_accuracy": 0.861120343208313, + "num_tokens": 62222283.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 6.0498714447021484e-06, + "grad_norm": 1.8975212574005127, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8415098190307617, + "num_tokens": 62254190.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.785309076309204, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4204, + "mean_token_accuracy": 0.8584656715393066, + "num_tokens": 62284846.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.6942182779312134, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8487718105316162, + "num_tokens": 62318505.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.625423789024353, + "learning_rate": 6.918185671894871e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8549848794937134, + "num_tokens": 62356074.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.7267118692398071, + "learning_rate": 6.922424756252649e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8542388081550598, + "num_tokens": 62390272.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.843988299369812, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8361178040504456, + "num_tokens": 62420769.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5890488624572754, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4227, + "mean_token_accuracy": 0.8622256517410278, + "num_tokens": 62456147.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5862586498260498, + "learning_rate": 6.935142009325985e-07, + "loss": 0.3728, + "mean_token_accuracy": 0.8743159770965576, + "num_tokens": 62490722.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.73491632938385, + "learning_rate": 6.939381093683764e-07, + "loss": 0.388, + "mean_token_accuracy": 0.8684195876121521, + "num_tokens": 62525803.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5514605045318604, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8442314863204956, + "num_tokens": 62569531.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.534284234046936, + "learning_rate": 6.947859262399321e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8541021943092346, + "num_tokens": 62612573.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5370604991912842, + "learning_rate": 6.952098346757101e-07, + "loss": 0.452, + "mean_token_accuracy": 0.8520109057426453, + "num_tokens": 62656060.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.9517295360565186, + "learning_rate": 6.956337431114879e-07, + "loss": 0.3807, + "mean_token_accuracy": 0.8707040548324585, + "num_tokens": 62689074.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5998398065567017, + "learning_rate": 6.960576515472658e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8529510498046875, + "num_tokens": 62728451.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.647834300994873, + "learning_rate": 6.964815599830436e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.846820592880249, + "num_tokens": 62767470.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.6274601221084595, + "learning_rate": 6.969054684188215e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8536732196807861, + "num_tokens": 62804459.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.5885601043701172, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.860743522644043, + "num_tokens": 62840042.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.583590030670166, + "learning_rate": 6.977532852903773e-07, + "loss": 0.4465, + "mean_token_accuracy": 0.855277419090271, + "num_tokens": 62877999.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.8301568031311035, + "learning_rate": 6.981771937261551e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8334455490112305, + "num_tokens": 62908411.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.6975467205047607, + "learning_rate": 6.986011021619331e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.848899245262146, + "num_tokens": 62945332.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.732474446296692, + "learning_rate": 6.990250105977109e-07, + "loss": 0.4158, + "mean_token_accuracy": 0.8617637157440186, + "num_tokens": 62980118.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.665750503540039, + "learning_rate": 6.994489190334886e-07, + "loss": 0.409, + "mean_token_accuracy": 0.8683139681816101, + "num_tokens": 63016268.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 6.079673767089844e-06, + "grad_norm": 1.595188856124878, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8470157384872437, + "num_tokens": 63057883.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.6782852411270142, + "learning_rate": 7.002967359050444e-07, + "loss": 0.5118, + "mean_token_accuracy": 0.8316522240638733, + "num_tokens": 63097503.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.5876013040542603, + "learning_rate": 7.007206443408224e-07, + "loss": 0.4322, + "mean_token_accuracy": 0.858380138874054, + "num_tokens": 63135265.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.5810049772262573, + "learning_rate": 7.011445527766002e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8339793682098389, + "num_tokens": 63178665.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 6.109476089477539e-06, + "grad_norm": 1.5021413564682007, + "learning_rate": 7.015684612123781e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8526220321655273, + "num_tokens": 63224590.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5342401266098022, + "learning_rate": 7.01992369648156e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8515327572822571, + "num_tokens": 63269024.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.6031343936920166, + "learning_rate": 7.024162780839339e-07, + "loss": 0.4223, + "mean_token_accuracy": 0.8626426458358765, + "num_tokens": 63306880.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.822303295135498, + "learning_rate": 7.028401865197116e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8315796256065369, + "num_tokens": 63338877.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5897326469421387, + "learning_rate": 7.032640949554896e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8505430221557617, + "num_tokens": 63378434.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5777411460876465, + "learning_rate": 7.036880033912674e-07, + "loss": 0.5065, + "mean_token_accuracy": 0.8356228470802307, + "num_tokens": 63418743.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5180283784866333, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8618847131729126, + "num_tokens": 63457842.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5660661458969116, + "learning_rate": 7.045358202628232e-07, + "loss": 0.457, + "mean_token_accuracy": 0.852627158164978, + "num_tokens": 63499617.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.595963954925537, + "learning_rate": 7.049597286986011e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8543177247047424, + "num_tokens": 63538930.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5955753326416016, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4093, + "mean_token_accuracy": 0.8627053499221802, + "num_tokens": 63576405.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.8458195924758911, + "learning_rate": 7.058075455701568e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8378952741622925, + "num_tokens": 63609563.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.6107921600341797, + "learning_rate": 7.062314540059346e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8498939275741577, + "num_tokens": 63648982.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.749038815498352, + "learning_rate": 7.066553624417126e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8342058062553406, + "num_tokens": 63687237.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.6592437028884888, + "learning_rate": 7.070792708774904e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8516233563423157, + "num_tokens": 63721259.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5359301567077637, + "learning_rate": 7.075031793132684e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8526729345321655, + "num_tokens": 63762170.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5110430717468262, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4365, + "mean_token_accuracy": 0.861926794052124, + "num_tokens": 63802440.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.5434792041778564, + "learning_rate": 7.08350996184824e-07, + "loss": 0.4654, + "mean_token_accuracy": 0.8439940810203552, + "num_tokens": 63842271.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 6.139278411865234e-06, + "grad_norm": 1.6592950820922852, + "learning_rate": 7.08774904620602e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8523674011230469, + "num_tokens": 63877905.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.8359025716781616, + "learning_rate": 7.091988130563797e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8389511108398438, + "num_tokens": 63911840.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.6022028923034668, + "learning_rate": 7.096227214921576e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8355559706687927, + "num_tokens": 63949751.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.5531847476959229, + "learning_rate": 7.100466299279355e-07, + "loss": 0.3814, + "mean_token_accuracy": 0.8740274906158447, + "num_tokens": 63986073.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.4828574657440186, + "learning_rate": 7.104705383637134e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.847610354423523, + "num_tokens": 64027920.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.507257342338562, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4376, + "mean_token_accuracy": 0.8545538783073425, + "num_tokens": 64069861.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.7374616861343384, + "learning_rate": 7.113183552352692e-07, + "loss": 0.3933, + "mean_token_accuracy": 0.8689917325973511, + "num_tokens": 64100042.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.6129976511001587, + "learning_rate": 7.11742263671047e-07, + "loss": 0.4576, + "mean_token_accuracy": 0.8502664566040039, + "num_tokens": 64138619.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.5585309267044067, + "learning_rate": 7.12166172106825e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8530703783035278, + "num_tokens": 64182342.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 2.284916877746582, + "learning_rate": 7.125900805426027e-07, + "loss": 0.453, + "mean_token_accuracy": 0.8488555550575256, + "num_tokens": 64217975.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 6.16908073425293e-06, + "grad_norm": 1.534466028213501, + "learning_rate": 7.130139889783806e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8489062786102295, + "num_tokens": 64261238.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.4905004501342773, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8532918691635132, + "num_tokens": 64301429.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.5828813314437866, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.862628698348999, + "num_tokens": 64339795.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.5158600807189941, + "learning_rate": 7.142857142857143e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8525874614715576, + "num_tokens": 64382104.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 6.198883056640625e-06, + "grad_norm": 1.4591087102890015, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4387, + "mean_token_accuracy": 0.8543601036071777, + "num_tokens": 64423890.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.5762615203857422, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4097, + "mean_token_accuracy": 0.863662838935852, + "num_tokens": 64459366.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.6266076564788818, + "learning_rate": 7.155574395930479e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8485100865364075, + "num_tokens": 64497524.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.5612353086471558, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4368, + "mean_token_accuracy": 0.8559277057647705, + "num_tokens": 64534883.0, + "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.6956411600112915, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8474181294441223, + "num_tokens": 64566266.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.7637609243392944, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8492006063461304, + "num_tokens": 64600493.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.5500280857086182, + "learning_rate": 7.172530733361593e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8643180131912231, + "num_tokens": 64639951.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.4791107177734375, + "learning_rate": 7.176769817719373e-07, + "loss": 0.4224, + "mean_token_accuracy": 0.8594855666160583, + "num_tokens": 64682160.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.6801550388336182, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8595228791236877, + "num_tokens": 64718162.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 6.22868537902832e-06, + "grad_norm": 1.6071696281433105, + "learning_rate": 7.18524798643493e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8493672609329224, + "num_tokens": 64756421.0, + "step": 1696 + }, + { + "epoch": 0.21587584276809565, + "ewc_loss": 6.258487701416016e-06, + "grad_norm": 1.7834985256195068, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8459621667861938, + "num_tokens": 64792142.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.5151864290237427, + "learning_rate": 7.193726155150487e-07, + "loss": 0.3988, + "mean_token_accuracy": 0.8677418828010559, + "num_tokens": 64832629.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.568325400352478, + "learning_rate": 7.197965239508265e-07, + "loss": 0.3945, + "mean_token_accuracy": 0.869666337966919, + "num_tokens": 64869744.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.6377067565917969, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8503332138061523, + "num_tokens": 64909872.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7919831275939941, + "learning_rate": 7.206443408223823e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8604774475097656, + "num_tokens": 64946436.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.6253252029418945, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.853103518486023, + "num_tokens": 64987834.0, + "step": 1702 + }, + { + "epoch": 0.2166391044396387, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.5962783098220825, + "learning_rate": 7.214921576939381e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8552820682525635, + "num_tokens": 65026880.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 2.33000111579895, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8357076048851013, + "num_tokens": 65063300.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7076952457427979, + "learning_rate": 7.223399745654938e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8358102440834045, + "num_tokens": 65102784.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.594636082649231, + "learning_rate": 7.227638830012717e-07, + "loss": 0.4486, + "mean_token_accuracy": 0.85662442445755, + "num_tokens": 65140764.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7436583042144775, + "learning_rate": 7.231877914370495e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8505535125732422, + "num_tokens": 65173764.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7518441677093506, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8218586444854736, + "num_tokens": 65211744.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.4445948600769043, + "learning_rate": 7.240356083086053e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8548835515975952, + "num_tokens": 65254857.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7980945110321045, + "learning_rate": 7.244595167443833e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8282308578491211, + "num_tokens": 65287918.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.7346054315567017, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8534646034240723, + "num_tokens": 65320884.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.5299031734466553, + "learning_rate": 7.253073336159388e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.856421947479248, + "num_tokens": 65362640.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 6.288290023803711e-06, + "grad_norm": 1.6463744640350342, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8548184633255005, + "num_tokens": 65397302.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.544310212135315, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8504899144172668, + "num_tokens": 65438583.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.5728107690811157, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8500069379806519, + "num_tokens": 65480546.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.684572696685791, + "learning_rate": 7.270029673590504e-07, + "loss": 0.4505, + "mean_token_accuracy": 0.8485297560691833, + "num_tokens": 65514862.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.5859477519989014, + "learning_rate": 7.274268757948283e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8541357517242432, + "num_tokens": 65556017.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.6608473062515259, + "learning_rate": 7.278507842306062e-07, + "loss": 0.4136, + "mean_token_accuracy": 0.8629994988441467, + "num_tokens": 65591591.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.5784717798233032, + "learning_rate": 7.282746926663841e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8395847082138062, + "num_tokens": 65634362.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 6.318092346191406e-06, + "grad_norm": 1.7276402711868286, + "learning_rate": 7.286986011021618e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8356107473373413, + "num_tokens": 65668843.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 6.3478946685791016e-06, + "grad_norm": 1.6301230192184448, + "learning_rate": 7.291225095379398e-07, + "loss": 0.4246, + "mean_token_accuracy": 0.8572134971618652, + "num_tokens": 65708082.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7173906564712524, + "learning_rate": 7.295464179737176e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8503930568695068, + "num_tokens": 65748477.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.5252844095230103, + "learning_rate": 7.299703264094955e-07, + "loss": 0.3987, + "mean_token_accuracy": 0.8687469959259033, + "num_tokens": 65789665.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.6499069929122925, + "learning_rate": 7.303942348452734e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.8604744672775269, + "num_tokens": 65824062.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.9582815170288086, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8546490669250488, + "num_tokens": 65858880.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.58559250831604, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4067, + "mean_token_accuracy": 0.8654052019119263, + "num_tokens": 65893836.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.7085734605789185, + "learning_rate": 7.31665960152607e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8489817976951599, + "num_tokens": 65930040.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.5483835935592651, + "learning_rate": 7.320898685883848e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8259566426277161, + "num_tokens": 65972652.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.8150572776794434, + "learning_rate": 7.325137770241628e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8415251970291138, + "num_tokens": 66007606.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.5439497232437134, + "learning_rate": 7.329376854599406e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.84111487865448, + "num_tokens": 66052501.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.6963471174240112, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4438, + "mean_token_accuracy": 0.8511730432510376, + "num_tokens": 66086378.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.5813311338424683, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4228, + "mean_token_accuracy": 0.859247088432312, + "num_tokens": 66127081.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 6.377696990966797e-06, + "grad_norm": 1.6103712320327759, + "learning_rate": 7.342094107672742e-07, + "loss": 0.4219, + "mean_token_accuracy": 0.8585337400436401, + "num_tokens": 66165351.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.5402445793151855, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4715, + "mean_token_accuracy": 0.8496865630149841, + "num_tokens": 66206427.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 6.407499313354492e-06, + "grad_norm": 1.6008497476577759, + "learning_rate": 7.350572276388299e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8518968224525452, + "num_tokens": 66245308.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.6069369316101074, + "learning_rate": 7.354811360746078e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8483428359031677, + "num_tokens": 66284585.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.5262264013290405, + "learning_rate": 7.359050445103857e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8417103886604309, + "num_tokens": 66326688.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.575535535812378, + "learning_rate": 7.363289529461636e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8413485884666443, + "num_tokens": 66369618.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.6077158451080322, + "learning_rate": 7.367528613819415e-07, + "loss": 0.426, + "mean_token_accuracy": 0.860095739364624, + "num_tokens": 66402601.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.515622854232788, + "learning_rate": 7.371767698177194e-07, + "loss": 0.3974, + "mean_token_accuracy": 0.8677875995635986, + "num_tokens": 66442510.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.4786598682403564, + "learning_rate": 7.376006782534972e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8486866354942322, + "num_tokens": 66486807.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.4811009168624878, + "learning_rate": 7.380245866892751e-07, + "loss": 0.4174, + "mean_token_accuracy": 0.86211097240448, + "num_tokens": 66528067.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.621206521987915, + "learning_rate": 7.384484951250529e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8666356801986694, + "num_tokens": 66564397.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 6.4373016357421875e-06, + "grad_norm": 1.6418156623840332, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8584598302841187, + "num_tokens": 66600872.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.5078027248382568, + "learning_rate": 7.392963119966087e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.852702260017395, + "num_tokens": 66643698.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 6.467103958129883e-06, + "grad_norm": 1.6984018087387085, + "learning_rate": 7.397202204323866e-07, + "loss": 0.447, + "mean_token_accuracy": 0.8553540706634521, + "num_tokens": 66677814.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.572564959526062, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8589633703231812, + "num_tokens": 66717466.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.5521196126937866, + "learning_rate": 7.405680373039424e-07, + "loss": 0.441, + "mean_token_accuracy": 0.8545442819595337, + "num_tokens": 66755884.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6539981365203857, + "learning_rate": 7.409919457397202e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8380017280578613, + "num_tokens": 66797752.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6557724475860596, + "learning_rate": 7.414158541754981e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8495349884033203, + "num_tokens": 66831546.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6329147815704346, + "learning_rate": 7.418397626112759e-07, + "loss": 0.4644, + "mean_token_accuracy": 0.8456610441207886, + "num_tokens": 66871713.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.580453634262085, + "learning_rate": 7.422636710470537e-07, + "loss": 0.4208, + "mean_token_accuracy": 0.8621649742126465, + "num_tokens": 66912247.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6127427816390991, + "learning_rate": 7.426875794828317e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8496693968772888, + "num_tokens": 66956780.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.5257660150527954, + "learning_rate": 7.431114879186095e-07, + "loss": 0.3986, + "mean_token_accuracy": 0.8691548109054565, + "num_tokens": 66996904.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6786799430847168, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.856877326965332, + "num_tokens": 67034356.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6490275859832764, + "learning_rate": 7.439593047901653e-07, + "loss": 0.4446, + "mean_token_accuracy": 0.8534032106399536, + "num_tokens": 67068182.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7632966041564941, + "learning_rate": 7.443832132259431e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8361592888832092, + "num_tokens": 67104884.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.5315899848937988, + "learning_rate": 7.44807121661721e-07, + "loss": 0.4354, + "mean_token_accuracy": 0.8568752408027649, + "num_tokens": 67145547.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.5662832260131836, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8473411202430725, + "num_tokens": 67188423.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.7368444204330444, + "learning_rate": 7.456549385332767e-07, + "loss": 0.4667, + "mean_token_accuracy": 0.8499590158462524, + "num_tokens": 67220534.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 6.5267086029052734e-06, + "grad_norm": 1.6236821413040161, + "learning_rate": 7.460788469690547e-07, + "loss": 0.4005, + "mean_token_accuracy": 0.8690377473831177, + "num_tokens": 67256012.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6284468173980713, + "learning_rate": 7.465027554048325e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8417785167694092, + "num_tokens": 67293671.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.4572514295578003, + "learning_rate": 7.469266638406105e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.8565115928649902, + "num_tokens": 67340514.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6352086067199707, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4069, + "mean_token_accuracy": 0.8587422966957092, + "num_tokens": 67370882.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.4959698915481567, + "learning_rate": 7.477744807121661e-07, + "loss": 0.4367, + "mean_token_accuracy": 0.852726399898529, + "num_tokens": 67408968.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.582642912864685, + "learning_rate": 7.48198389147944e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8396098613739014, + "num_tokens": 67449528.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.5217851400375366, + "learning_rate": 7.486222975837219e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8499049544334412, + "num_tokens": 67491306.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.833890676498413, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.8428369760513306, + "num_tokens": 67521783.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.660762906074524, + "learning_rate": 7.494701144552777e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8541563153266907, + "num_tokens": 67558501.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.5660076141357422, + "learning_rate": 7.498940228910555e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8361974954605103, + "num_tokens": 67600562.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.492851734161377, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.864892840385437, + "num_tokens": 67644170.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.62226140499115, + "learning_rate": 7.507418397626113e-07, + "loss": 0.429, + "mean_token_accuracy": 0.8590890169143677, + "num_tokens": 67680227.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6115047931671143, + "learning_rate": 7.51165748198389e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8311076164245605, + "num_tokens": 67721471.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.614618182182312, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8541820049285889, + "num_tokens": 67759239.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6555869579315186, + "learning_rate": 7.520135650699448e-07, + "loss": 0.422, + "mean_token_accuracy": 0.8616266846656799, + "num_tokens": 67794554.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.5859906673431396, + "learning_rate": 7.524374735057227e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8635609149932861, + "num_tokens": 67831158.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6858093738555908, + "learning_rate": 7.528613819415006e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8318173885345459, + "num_tokens": 67870262.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.592788815498352, + "learning_rate": 7.532852903772785e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8461685180664062, + "num_tokens": 67912055.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.5666877031326294, + "learning_rate": 7.537091988130564e-07, + "loss": 0.4197, + "mean_token_accuracy": 0.8611220121383667, + "num_tokens": 67948801.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6453148126602173, + "learning_rate": 7.541331072488342e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8551791906356812, + "num_tokens": 67987530.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 6.586313247680664e-06, + "grad_norm": 1.6593583822250366, + "learning_rate": 7.54557015684612e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8484108448028564, + "num_tokens": 68027335.0, + "step": 1781 + }, + { + "epoch": 0.22668871644828903, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.7961565256118774, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8424688577651978, + "num_tokens": 68060595.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6684359312057495, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8569153547286987, + "num_tokens": 68094520.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6223329305648804, + "learning_rate": 7.558287409919457e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8600320816040039, + "num_tokens": 68129755.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.656981348991394, + "learning_rate": 7.562526494277236e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8504701852798462, + "num_tokens": 68166045.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6182695627212524, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4233, + "mean_token_accuracy": 0.8619290590286255, + "num_tokens": 68205172.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6351155042648315, + "learning_rate": 7.571004662992794e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8398712873458862, + "num_tokens": 68246147.0, + "step": 1787 + }, + { + "epoch": 0.2274519781198321, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.572925090789795, + "learning_rate": 7.575243747350572e-07, + "loss": 0.4475, + "mean_token_accuracy": 0.8545500636100769, + "num_tokens": 68284744.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.5920758247375488, + "learning_rate": 7.57948283170835e-07, + "loss": 0.4789, + "mean_token_accuracy": 0.841694176197052, + "num_tokens": 68325269.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.5704225301742554, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4225, + "mean_token_accuracy": 0.8594313859939575, + "num_tokens": 68365790.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.658223032951355, + "learning_rate": 7.587961000423908e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8456862568855286, + "num_tokens": 68404979.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6058812141418457, + "learning_rate": 7.592200084781686e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8458448052406311, + "num_tokens": 68445892.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.5389567613601685, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8523322939872742, + "num_tokens": 68487817.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6401821374893188, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8552734851837158, + "num_tokens": 68525642.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.7209655046463013, + "learning_rate": 7.604917337855023e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8277304172515869, + "num_tokens": 68562914.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.6415687799453735, + "learning_rate": 7.609156422212801e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8345067501068115, + "num_tokens": 68603649.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.7095613479614258, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4137, + "mean_token_accuracy": 0.8645728826522827, + "num_tokens": 68635941.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 6.645917892456055e-06, + "grad_norm": 1.7728151082992554, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8538569808006287, + "num_tokens": 68668798.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.6151654720306396, + "learning_rate": 7.621873675286138e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8477576971054077, + "num_tokens": 68705559.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.4756251573562622, + "learning_rate": 7.626112759643916e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8413702249526978, + "num_tokens": 68751555.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.5398956537246704, + "learning_rate": 7.630351844001696e-07, + "loss": 0.3901, + "mean_token_accuracy": 0.8684089779853821, + "num_tokens": 68791184.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.5859289169311523, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4063, + "mean_token_accuracy": 0.8656061887741089, + "num_tokens": 68832126.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.6547209024429321, + "learning_rate": 7.638830012717253e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8423213958740234, + "num_tokens": 68872981.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.648794174194336, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8409609794616699, + "num_tokens": 68911300.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.6028838157653809, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4822, + "mean_token_accuracy": 0.8429352045059204, + "num_tokens": 68947515.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.5522700548171997, + "learning_rate": 7.651547265790589e-07, + "loss": 0.4986, + "mean_token_accuracy": 0.8369852304458618, + "num_tokens": 68992227.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.5921761989593506, + "learning_rate": 7.655786350148368e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8447278738021851, + "num_tokens": 69031529.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 6.67572021484375e-06, + "grad_norm": 1.6649278402328491, + "learning_rate": 7.660025434506146e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8399524688720703, + "num_tokens": 69068685.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 6.705522537231445e-06, + "grad_norm": 1.531089186668396, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.8586819767951965, + "num_tokens": 69109926.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.5632082223892212, + "learning_rate": 7.668503603221704e-07, + "loss": 0.4025, + "mean_token_accuracy": 0.8650909066200256, + "num_tokens": 69145900.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.5748487710952759, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4326, + "mean_token_accuracy": 0.8571382761001587, + "num_tokens": 69191279.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.708112359046936, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4217, + "mean_token_accuracy": 0.8596580624580383, + "num_tokens": 69225649.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.541669249534607, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8512792587280273, + "num_tokens": 69268088.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.4840089082717896, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8435356616973877, + "num_tokens": 69312271.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7132933139801025, + "learning_rate": 7.689699025010597e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.836310863494873, + "num_tokens": 69351470.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.7614049911499023, + "learning_rate": 7.693938109368376e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8400483131408691, + "num_tokens": 69383869.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 6.735324859619141e-06, + "grad_norm": 1.5595561265945435, + "learning_rate": 7.698177193726155e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8461973667144775, + "num_tokens": 69424584.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.875593662261963, + "learning_rate": 7.702416278083933e-07, + "loss": 0.5027, + "mean_token_accuracy": 0.834478497505188, + "num_tokens": 69458393.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.6362022161483765, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8499287366867065, + "num_tokens": 69494102.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.45830237865448, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4138, + "mean_token_accuracy": 0.8638033270835876, + "num_tokens": 69536041.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.6975246667861938, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4184, + "mean_token_accuracy": 0.8590229749679565, + "num_tokens": 69570874.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.5675235986709595, + "learning_rate": 7.719372615515049e-07, + "loss": 0.4491, + "mean_token_accuracy": 0.852304220199585, + "num_tokens": 69611597.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.4110069274902344, + "learning_rate": 7.723611699872827e-07, + "loss": 0.3534, + "mean_token_accuracy": 0.8830742835998535, + "num_tokens": 69653728.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.5743805170059204, + "learning_rate": 7.727850784230606e-07, + "loss": 0.45, + "mean_token_accuracy": 0.852405309677124, + "num_tokens": 69694138.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.5780560970306396, + "learning_rate": 7.732089868588385e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8481581211090088, + "num_tokens": 69737329.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.733749508857727, + "learning_rate": 7.736328952946163e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8382976055145264, + "num_tokens": 69772122.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.5694971084594727, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4583, + "mean_token_accuracy": 0.8494440317153931, + "num_tokens": 69810025.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.6782381534576416, + "learning_rate": 7.744807121661721e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8382992148399353, + "num_tokens": 69851187.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 6.765127182006836e-06, + "grad_norm": 1.7377829551696777, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4314, + "mean_token_accuracy": 0.8502833843231201, + "num_tokens": 69883865.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.510548710823059, + "learning_rate": 7.753285290377279e-07, + "loss": 0.3918, + "mean_token_accuracy": 0.8695333003997803, + "num_tokens": 69927217.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6611168384552002, + "learning_rate": 7.757524374735057e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8550187945365906, + "num_tokens": 69962482.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.5698047876358032, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8550164699554443, + "num_tokens": 70006224.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6523686647415161, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8608771562576294, + "num_tokens": 70043352.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.5256462097167969, + "learning_rate": 7.770241627808392e-07, + "loss": 0.3836, + "mean_token_accuracy": 0.8727563619613647, + "num_tokens": 70084915.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.6018036603927612, + "learning_rate": 7.774480712166172e-07, + "loss": 0.4683, + "mean_token_accuracy": 0.8472645282745361, + "num_tokens": 70125427.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.7932838201522827, + "learning_rate": 7.77871979652395e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8306291103363037, + "num_tokens": 70156361.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.5659255981445312, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8507363796234131, + "num_tokens": 70197690.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.5945355892181396, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4657, + "mean_token_accuracy": 0.8482480049133301, + "num_tokens": 70240149.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 6.794929504394531e-06, + "grad_norm": 1.60718834400177, + "learning_rate": 7.791437049597287e-07, + "loss": 0.4154, + "mean_token_accuracy": 0.8599350452423096, + "num_tokens": 70278752.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.6285667419433594, + "learning_rate": 7.795676133955065e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8433336019515991, + "num_tokens": 70317152.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.7597347497940063, + "learning_rate": 7.799915218312844e-07, + "loss": 0.4079, + "mean_token_accuracy": 0.8680169582366943, + "num_tokens": 70348858.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.574007272720337, + "learning_rate": 7.804154302670622e-07, + "loss": 0.4596, + "mean_token_accuracy": 0.8468964099884033, + "num_tokens": 70392650.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.6637862920761108, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4306, + "mean_token_accuracy": 0.854928731918335, + "num_tokens": 70426309.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.5602744817733765, + "learning_rate": 7.81263247138618e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8478277921676636, + "num_tokens": 70465411.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.5707712173461914, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4191, + "mean_token_accuracy": 0.8659673929214478, + "num_tokens": 70504978.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 6.8247318267822266e-06, + "grad_norm": 1.6529914140701294, + "learning_rate": 7.821110640101738e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8595728278160095, + "num_tokens": 70538962.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 6.884336471557617e-06, + "grad_norm": 1.6915920972824097, + "learning_rate": 7.825349724459517e-07, + "loss": 0.5161, + "mean_token_accuracy": 0.8316986560821533, + "num_tokens": 70576384.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.6287403106689453, + "learning_rate": 7.829588808817294e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8312203884124756, + "num_tokens": 70619840.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.4700262546539307, + "learning_rate": 7.833827893175074e-07, + "loss": 0.4144, + "mean_token_accuracy": 0.8620109558105469, + "num_tokens": 70660609.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.6104656457901, + "learning_rate": 7.838066977532852e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8573007583618164, + "num_tokens": 70704359.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.5883429050445557, + "learning_rate": 7.842306061890632e-07, + "loss": 0.4468, + "mean_token_accuracy": 0.8549773097038269, + "num_tokens": 70742328.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.601681113243103, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4478, + "mean_token_accuracy": 0.8554273843765259, + "num_tokens": 70779796.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 6.9141387939453125e-06, + "grad_norm": 1.5857658386230469, + "learning_rate": 7.850784230606188e-07, + "loss": 0.488, + "mean_token_accuracy": 0.8376391530036926, + "num_tokens": 70819494.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7016741037368774, + "learning_rate": 7.855023314963968e-07, + "loss": 0.4299, + "mean_token_accuracy": 0.8580399751663208, + "num_tokens": 70856264.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.5311710834503174, + "learning_rate": 7.859262399321746e-07, + "loss": 0.423, + "mean_token_accuracy": 0.8584932684898376, + "num_tokens": 70898120.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.6112810373306274, + "learning_rate": 7.863501483679524e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8493261337280273, + "num_tokens": 70934625.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.6055021286010742, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4156, + "mean_token_accuracy": 0.8619166612625122, + "num_tokens": 70971141.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.7564527988433838, + "learning_rate": 7.871979652395082e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8429408669471741, + "num_tokens": 71011703.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.8056068420410156, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8406466841697693, + "num_tokens": 71046386.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 6.973743438720703e-06, + "grad_norm": 1.896544098854065, + "learning_rate": 7.88045782111064e-07, + "loss": 0.4855, + "mean_token_accuracy": 0.8419017791748047, + "num_tokens": 71084741.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.5487921237945557, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4104, + "mean_token_accuracy": 0.8637028336524963, + "num_tokens": 71128456.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.723086953163147, + "learning_rate": 7.888935989826198e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8379834890365601, + "num_tokens": 71165779.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.595583200454712, + "learning_rate": 7.893175074183976e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8311705589294434, + "num_tokens": 71207611.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.5592586994171143, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.852060079574585, + "num_tokens": 71246104.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.592606544494629, + "learning_rate": 7.901653242899533e-07, + "loss": 0.3994, + "mean_token_accuracy": 0.8625856637954712, + "num_tokens": 71284182.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.6040455102920532, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8491120338439941, + "num_tokens": 71323037.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.6061310768127441, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4563, + "mean_token_accuracy": 0.8539406061172485, + "num_tokens": 71360240.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.5928852558135986, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8548431396484375, + "num_tokens": 71396564.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.682062029838562, + "learning_rate": 7.918609580330648e-07, + "loss": 0.4031, + "mean_token_accuracy": 0.8613859415054321, + "num_tokens": 71426328.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 7.033348083496094e-06, + "grad_norm": 1.6266355514526367, + "learning_rate": 7.922848664688428e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8551952838897705, + "num_tokens": 71463884.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.5070842504501343, + "learning_rate": 7.927087749046205e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.8596386313438416, + "num_tokens": 71503009.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.670778751373291, + "learning_rate": 7.931326833403983e-07, + "loss": 0.4645, + "mean_token_accuracy": 0.8542008399963379, + "num_tokens": 71544293.0, + "step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.7777153253555298, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8424426317214966, + "num_tokens": 71578801.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.6717478036880493, + "learning_rate": 7.939805002119541e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.860026478767395, + "num_tokens": 71617263.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 7.063150405883789e-06, + "grad_norm": 1.5656757354736328, + "learning_rate": 7.944044086477321e-07, + "loss": 0.3855, + "mean_token_accuracy": 0.8722966909408569, + "num_tokens": 71656667.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.5577555894851685, + "learning_rate": 7.948283170835099e-07, + "loss": 0.4369, + "mean_token_accuracy": 0.8552103042602539, + "num_tokens": 71696255.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.658165454864502, + "learning_rate": 7.952522255192878e-07, + "loss": 0.4181, + "mean_token_accuracy": 0.8634126782417297, + "num_tokens": 71730935.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.5151299238204956, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8570472002029419, + "num_tokens": 71774487.0, + "step": 1878 + }, + { + "epoch": 0.2390281134715685, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.7161215543746948, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8480900526046753, + "num_tokens": 71808477.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.630816102027893, + "learning_rate": 7.965239508266214e-07, + "loss": 0.445, + "mean_token_accuracy": 0.8528872728347778, + "num_tokens": 71848507.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.5960383415222168, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8475135564804077, + "num_tokens": 71889405.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 2.235853433609009, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8442613482475281, + "num_tokens": 71931033.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 7.12275505065918e-06, + "grad_norm": 1.7890855073928833, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8538452386856079, + "num_tokens": 71961145.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.6708236932754517, + "learning_rate": 7.982195845697329e-07, + "loss": 0.4081, + "mean_token_accuracy": 0.8649923801422119, + "num_tokens": 71995502.0, + "step": 1884 + }, + { + "epoch": 0.23979137514311155, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.713133454322815, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8462278842926025, + "num_tokens": 72031102.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.5552890300750732, + "learning_rate": 7.990674014412886e-07, + "loss": 0.4253, + "mean_token_accuracy": 0.861650824546814, + "num_tokens": 72071275.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.9188624620437622, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8451336622238159, + "num_tokens": 72101036.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.5781443119049072, + "learning_rate": 7.999152183128444e-07, + "loss": 0.4528, + "mean_token_accuracy": 0.8547824621200562, + "num_tokens": 72140977.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.611013650894165, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4271, + "mean_token_accuracy": 0.8580306172370911, + "num_tokens": 72175155.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.5635230541229248, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.8591020703315735, + "num_tokens": 72212799.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.6219565868377686, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8540694117546082, + "num_tokens": 72247820.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 7.152557373046875e-06, + "grad_norm": 1.774009108543396, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.860197901725769, + "num_tokens": 72278349.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.498904824256897, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8584688901901245, + "num_tokens": 72318638.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.6588704586029053, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4284, + "mean_token_accuracy": 0.8553347587585449, + "num_tokens": 72354653.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.787346363067627, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8476533889770508, + "num_tokens": 72386069.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.6804313659667969, + "learning_rate": 8.033064857990674e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8525294065475464, + "num_tokens": 72423280.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.662718415260315, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8468085527420044, + "num_tokens": 72461482.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 7.18235969543457e-06, + "grad_norm": 1.7031370401382446, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4196, + "mean_token_accuracy": 0.8624659776687622, + "num_tokens": 72494734.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.6683518886566162, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4762, + "mean_token_accuracy": 0.8420506715774536, + "num_tokens": 72536148.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.603750467300415, + "learning_rate": 8.050021195421789e-07, + "loss": 0.4893, + "mean_token_accuracy": 0.8431884050369263, + "num_tokens": 72573056.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.5892971754074097, + "learning_rate": 8.054260279779567e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8570536375045776, + "num_tokens": 72610351.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.5631769895553589, + "learning_rate": 8.058499364137346e-07, + "loss": 0.406, + "mean_token_accuracy": 0.8656423091888428, + "num_tokens": 72645782.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.6599647998809814, + "learning_rate": 8.062738448495124e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8620384335517883, + "num_tokens": 72679174.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.5417999029159546, + "learning_rate": 8.066977532852904e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8590913414955139, + "num_tokens": 72719712.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.6866731643676758, + "learning_rate": 8.071216617210682e-07, + "loss": 0.476, + "mean_token_accuracy": 0.8436664342880249, + "num_tokens": 72755752.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.5504204034805298, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8484458327293396, + "num_tokens": 72791906.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.717511534690857, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4587, + "mean_token_accuracy": 0.8479978442192078, + "num_tokens": 72826581.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 7.212162017822266e-06, + "grad_norm": 1.8385602235794067, + "learning_rate": 8.083933870284019e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8339781761169434, + "num_tokens": 72859504.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.7569221258163452, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4283, + "mean_token_accuracy": 0.8616385459899902, + "num_tokens": 72891348.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.4702365398406982, + "learning_rate": 8.092412038999576e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8387570381164551, + "num_tokens": 72934241.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.54920494556427, + "learning_rate": 8.096651123357354e-07, + "loss": 0.4078, + "mean_token_accuracy": 0.8645678758621216, + "num_tokens": 72972193.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.527248501777649, + "learning_rate": 8.100890207715134e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8591634631156921, + "num_tokens": 73008933.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.5063525438308716, + "learning_rate": 8.105129292072912e-07, + "loss": 0.3996, + "mean_token_accuracy": 0.8660974502563477, + "num_tokens": 73048205.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.6458083391189575, + "learning_rate": 8.10936837643069e-07, + "loss": 0.4797, + "mean_token_accuracy": 0.8455910086631775, + "num_tokens": 73086606.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "ewc_loss": 7.271766662597656e-06, + "grad_norm": 1.6199856996536255, + "learning_rate": 8.11360746078847e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8569682240486145, + "num_tokens": 73125278.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.6550801992416382, + "learning_rate": 8.117846545146248e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8546857237815857, + "num_tokens": 73163158.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.6268086433410645, + "learning_rate": 8.122085629504026e-07, + "loss": 0.4523, + "mean_token_accuracy": 0.8537312150001526, + "num_tokens": 73199180.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.5613179206848145, + "learning_rate": 8.126324713861805e-07, + "loss": 0.5097, + "mean_token_accuracy": 0.8316411972045898, + "num_tokens": 73238426.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.5422528982162476, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8581985235214233, + "num_tokens": 73273722.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.759930968284607, + "learning_rate": 8.134802882577363e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8533415794372559, + "num_tokens": 73305824.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "ewc_loss": 7.3015689849853516e-06, + "grad_norm": 1.548644781112671, + "learning_rate": 8.139041966935142e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8558687567710876, + "num_tokens": 73348146.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.643821358680725, + "learning_rate": 8.14328105129292e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8433313369750977, + "num_tokens": 73384439.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.7527190446853638, + "learning_rate": 8.1475201356507e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8317249417304993, + "num_tokens": 73421245.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 7.331371307373047e-06, + "grad_norm": 1.5391522645950317, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.857819676399231, + "num_tokens": 73465371.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 7.361173629760742e-06, + "grad_norm": 1.6310105323791504, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4274, + "mean_token_accuracy": 0.8574104309082031, + "num_tokens": 73502525.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6046380996704102, + "learning_rate": 8.160237388724035e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.8636703491210938, + "num_tokens": 73537092.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6192326545715332, + "learning_rate": 8.164476473081814e-07, + "loss": 0.4414, + "mean_token_accuracy": 0.8554869890213013, + "num_tokens": 73574696.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.5002226829528809, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4185, + "mean_token_accuracy": 0.8598317503929138, + "num_tokens": 73615188.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.7281697988510132, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4316, + "mean_token_accuracy": 0.8570253849029541, + "num_tokens": 73651544.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6554561853408813, + "learning_rate": 8.17719372615515e-07, + "loss": 0.4658, + "mean_token_accuracy": 0.8473904132843018, + "num_tokens": 73687689.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.695867657661438, + "learning_rate": 8.18143281051293e-07, + "loss": 0.5119, + "mean_token_accuracy": 0.8297649025917053, + "num_tokens": 73725871.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.6531609296798706, + "learning_rate": 8.185671894870707e-07, + "loss": 0.377, + "mean_token_accuracy": 0.8722764253616333, + "num_tokens": 73758779.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 7.3909759521484375e-06, + "grad_norm": 1.5523934364318848, + "learning_rate": 8.189910979228485e-07, + "loss": 0.4661, + "mean_token_accuracy": 0.8476400375366211, + "num_tokens": 73801372.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.5273312330245972, + "learning_rate": 8.194150063586265e-07, + "loss": 0.443, + "mean_token_accuracy": 0.8532491326332092, + "num_tokens": 73845976.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.5950775146484375, + "learning_rate": 8.198389147944043e-07, + "loss": 0.5042, + "mean_token_accuracy": 0.83432537317276, + "num_tokens": 73886124.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.5979593992233276, + "learning_rate": 8.202628232301823e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8532912731170654, + "num_tokens": 73922487.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.597728967666626, + "learning_rate": 8.206867316659601e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8493644595146179, + "num_tokens": 73962659.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.4798310995101929, + "learning_rate": 8.21110640101738e-07, + "loss": 0.4572, + "mean_token_accuracy": 0.8478518724441528, + "num_tokens": 74006430.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.5369545221328735, + "learning_rate": 8.215345485375159e-07, + "loss": 0.3892, + "mean_token_accuracy": 0.8700690269470215, + "num_tokens": 74043157.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.6661418676376343, + "learning_rate": 8.219584569732937e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8542985916137695, + "num_tokens": 74082596.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 7.450580596923828e-06, + "grad_norm": 1.51669180393219, + "learning_rate": 8.223823654090715e-07, + "loss": 0.3977, + "mean_token_accuracy": 0.8686887621879578, + "num_tokens": 74122979.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.717085599899292, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8369897603988647, + "num_tokens": 74160866.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.6343905925750732, + "learning_rate": 8.232301822806273e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8501124382019043, + "num_tokens": 74196841.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.4600588083267212, + "learning_rate": 8.236540907164053e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8634483218193054, + "num_tokens": 74240800.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.626811146736145, + "learning_rate": 8.240779991521831e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8486413359642029, + "num_tokens": 74277593.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.513974666595459, + "learning_rate": 8.24501907587961e-07, + "loss": 0.4231, + "mean_token_accuracy": 0.8607869148254395, + "num_tokens": 74320353.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6792500019073486, + "learning_rate": 8.249258160237388e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8515559434890747, + "num_tokens": 74357820.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.5742734670639038, + "learning_rate": 8.253497244595167e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8490065336227417, + "num_tokens": 74399490.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.6326773166656494, + "learning_rate": 8.257736328952945e-07, + "loss": 0.4058, + "mean_token_accuracy": 0.8640193939208984, + "num_tokens": 74436750.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 7.4803829193115234e-06, + "grad_norm": 1.5882935523986816, + "learning_rate": 8.261975413310725e-07, + "loss": 0.4256, + "mean_token_accuracy": 0.8606061935424805, + "num_tokens": 74473918.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.7231107950210571, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.8646588325500488, + "num_tokens": 74508675.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 7.510185241699219e-06, + "grad_norm": 1.6100153923034668, + "learning_rate": 8.270453582026283e-07, + "loss": 0.4436, + "mean_token_accuracy": 0.8558099269866943, + "num_tokens": 74549856.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.606212854385376, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.84828782081604, + "num_tokens": 74587977.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.655910849571228, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4717, + "mean_token_accuracy": 0.848400354385376, + "num_tokens": 74623964.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.5341882705688477, + "learning_rate": 8.283170835099618e-07, + "loss": 0.3791, + "mean_token_accuracy": 0.8742426037788391, + "num_tokens": 74663293.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6196227073669434, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8521634936332703, + "num_tokens": 74702345.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6063079833984375, + "learning_rate": 8.291649003815175e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8489371538162231, + "num_tokens": 74740806.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.753378987312317, + "learning_rate": 8.295888088172954e-07, + "loss": 0.4335, + "mean_token_accuracy": 0.8600181341171265, + "num_tokens": 74771511.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6516578197479248, + "learning_rate": 8.300127172530733e-07, + "loss": 0.4277, + "mean_token_accuracy": 0.8578579425811768, + "num_tokens": 74807725.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.4991737604141235, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4355, + "mean_token_accuracy": 0.857132077217102, + "num_tokens": 74848841.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6231889724731445, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.850735068321228, + "num_tokens": 74887884.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6864173412322998, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.8540343642234802, + "num_tokens": 74922876.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6473006010055542, + "learning_rate": 8.317083509961848e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8458131551742554, + "num_tokens": 74961688.0, + "step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.498902678489685, + "learning_rate": 8.321322594319626e-07, + "loss": 0.3885, + "mean_token_accuracy": 0.8692903518676758, + "num_tokens": 75008625.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 7.539987564086914e-06, + "grad_norm": 1.6605839729309082, + "learning_rate": 8.325561678677405e-07, + "loss": 0.4684, + "mean_token_accuracy": 0.8475040793418884, + "num_tokens": 75047248.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.8024828433990479, + "learning_rate": 8.329800763035184e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8398101925849915, + "num_tokens": 75081897.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.6202329397201538, + "learning_rate": 8.334039847392963e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.8469232320785522, + "num_tokens": 75119687.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.648268461227417, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.8476613759994507, + "num_tokens": 75157160.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.6098177433013916, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4312, + "mean_token_accuracy": 0.8585382699966431, + "num_tokens": 75195232.0, + "step": 1969 + }, + { + "epoch": 0.2506042488233049, + "ewc_loss": 7.569789886474609e-06, + "grad_norm": 1.6129130125045776, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4611, + "mean_token_accuracy": 0.8456135988235474, + "num_tokens": 75231481.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.8633408546447754, + "learning_rate": 8.350996184824078e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.8308813571929932, + "num_tokens": 75265775.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.5274828672409058, + "learning_rate": 8.355235269181856e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8468543887138367, + "num_tokens": 75309391.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.742155909538269, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8475897908210754, + "num_tokens": 75344033.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.5873678922653198, + "learning_rate": 8.363713437897414e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8579157590866089, + "num_tokens": 75382684.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7401535511016846, + "learning_rate": 8.367952522255193e-07, + "loss": 0.4285, + "mean_token_accuracy": 0.8605748414993286, + "num_tokens": 75417375.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.4959203004837036, + "learning_rate": 8.372191606612972e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8620531558990479, + "num_tokens": 75456992.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 2.5997819900512695, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4129, + "mean_token_accuracy": 0.8617238998413086, + "num_tokens": 75492288.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 7.62939453125e-06, + "grad_norm": 1.7015509605407715, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8479934930801392, + "num_tokens": 75526495.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6999648809432983, + "learning_rate": 8.384908859686307e-07, + "loss": 0.4031, + "mean_token_accuracy": 0.8644927144050598, + "num_tokens": 75564145.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6039631366729736, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4567, + "mean_token_accuracy": 0.8475788831710815, + "num_tokens": 75604406.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.5686451196670532, + "learning_rate": 8.393387028401864e-07, + "loss": 0.435, + "mean_token_accuracy": 0.8604025840759277, + "num_tokens": 75648992.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.5824742317199707, + "learning_rate": 8.397626112759644e-07, + "loss": 0.4673, + "mean_token_accuracy": 0.8478090763092041, + "num_tokens": 75689573.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7237712144851685, + "learning_rate": 8.401865197117422e-07, + "loss": 0.4629, + "mean_token_accuracy": 0.8473491072654724, + "num_tokens": 75724798.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.9673749208450317, + "learning_rate": 8.406104281475202e-07, + "loss": 0.407, + "mean_token_accuracy": 0.862555205821991, + "num_tokens": 75758640.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.5934137105941772, + "learning_rate": 8.410343365832979e-07, + "loss": 0.4321, + "mean_token_accuracy": 0.8612078428268433, + "num_tokens": 75800684.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.549428939819336, + "learning_rate": 8.414582450190758e-07, + "loss": 0.416, + "mean_token_accuracy": 0.8640824556350708, + "num_tokens": 75841086.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7212767601013184, + "learning_rate": 8.418821534548537e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.842760443687439, + "num_tokens": 75877282.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7959998846054077, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8381898403167725, + "num_tokens": 75911668.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6412643194198608, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4483, + "mean_token_accuracy": 0.8508341908454895, + "num_tokens": 75950076.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.595723271369934, + "learning_rate": 8.431538787621874e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8480353355407715, + "num_tokens": 75993117.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.6749927997589111, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.856717586517334, + "num_tokens": 76030442.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7470017671585083, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8478084802627563, + "num_tokens": 76065457.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.7458152770996094, + "learning_rate": 8.444256040695209e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8264410495758057, + "num_tokens": 76102586.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.5650711059570312, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4881, + "mean_token_accuracy": 0.8420224189758301, + "num_tokens": 76144102.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.678061842918396, + "learning_rate": 8.452734209410767e-07, + "loss": 0.3958, + "mean_token_accuracy": 0.8642343878746033, + "num_tokens": 76176689.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 7.68899917602539e-06, + "grad_norm": 1.5676151514053345, + "learning_rate": 8.456973293768545e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8461109399795532, + "num_tokens": 76215587.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.8317493200302124, + "learning_rate": 8.461212378126325e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8310362100601196, + "num_tokens": 76252488.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7869640588760376, + "learning_rate": 8.465451462484103e-07, + "loss": 0.473, + "mean_token_accuracy": 0.8446865677833557, + "num_tokens": 76286071.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5750529766082764, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4765, + "mean_token_accuracy": 0.8438339829444885, + "num_tokens": 76326691.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.508714199066162, + "learning_rate": 8.47392963119966e-07, + "loss": 0.4756, + "mean_token_accuracy": 0.8437099456787109, + "num_tokens": 76370703.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.470261812210083, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4247, + "mean_token_accuracy": 0.8577907681465149, + "num_tokens": 76413844.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.59943687915802, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.8569930791854858, + "num_tokens": 76451189.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6210483312606812, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8451650738716125, + "num_tokens": 76490199.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6551196575164795, + "learning_rate": 8.490885968630775e-07, + "loss": 0.4296, + "mean_token_accuracy": 0.8609645962715149, + "num_tokens": 76527107.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.760652780532837, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8435173034667969, + "num_tokens": 76561906.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6458057165145874, + "learning_rate": 8.499364137346333e-07, + "loss": 0.4265, + "mean_token_accuracy": 0.8550559282302856, + "num_tokens": 76603096.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5897587537765503, + "learning_rate": 8.503603221704112e-07, + "loss": 0.4282, + "mean_token_accuracy": 0.8578032851219177, + "num_tokens": 76640935.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6198325157165527, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4652, + "mean_token_accuracy": 0.8487310409545898, + "num_tokens": 76680759.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5809857845306396, + "learning_rate": 8.512081390419669e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8610707521438599, + "num_tokens": 76720753.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.624289870262146, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8545212745666504, + "num_tokens": 76756934.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5304005146026611, + "learning_rate": 8.520559559135227e-07, + "loss": 0.3829, + "mean_token_accuracy": 0.872692346572876, + "num_tokens": 76795131.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.56168532371521, + "learning_rate": 8.524798643493005e-07, + "loss": 0.3812, + "mean_token_accuracy": 0.871605634689331, + "num_tokens": 76836550.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 7.808208465576172e-06, + "grad_norm": 2.3374924659729004, + "learning_rate": 8.529037727850785e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8467072248458862, + "num_tokens": 76876908.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.587814450263977, + "learning_rate": 8.533276812208563e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8502074480056763, + "num_tokens": 76917415.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7297122478485107, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.859870970249176, + "num_tokens": 76949010.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.650559425354004, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8556158542633057, + "num_tokens": 76990031.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.4898781776428223, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8522152900695801, + "num_tokens": 77030117.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5733577013015747, + "learning_rate": 8.550233149639677e-07, + "loss": 0.4157, + "mean_token_accuracy": 0.8622289896011353, + "num_tokens": 77069793.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.665446400642395, + "learning_rate": 8.554472233997456e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8447234630584717, + "num_tokens": 77105332.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6165827512741089, + "learning_rate": 8.558711318355235e-07, + "loss": 0.4291, + "mean_token_accuracy": 0.8592904806137085, + "num_tokens": 77147678.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.7360694408416748, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4142, + "mean_token_accuracy": 0.861160397529602, + "num_tokens": 77183805.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.572462797164917, + "learning_rate": 8.567189487070793e-07, + "loss": 0.4239, + "mean_token_accuracy": 0.862636923789978, + "num_tokens": 77224899.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5340688228607178, + "learning_rate": 8.57142857142857e-07, + "loss": 0.429, + "mean_token_accuracy": 0.8582162857055664, + "num_tokens": 77261351.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5923759937286377, + "learning_rate": 8.57566765578635e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8539731502532959, + "num_tokens": 77295632.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.5206204652786255, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8451204299926758, + "num_tokens": 77336614.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 7.748603820800781e-06, + "grad_norm": 1.6050677299499512, + "learning_rate": 8.584145824501907e-07, + "loss": 0.3921, + "mean_token_accuracy": 0.867546558380127, + "num_tokens": 77376821.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.5820034742355347, + "learning_rate": 8.588384908859686e-07, + "loss": 0.3921, + "mean_token_accuracy": 0.8695704936981201, + "num_tokens": 77415098.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.7179477214813232, + "learning_rate": 8.592623993217465e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8474771976470947, + "num_tokens": 77453579.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.6417001485824585, + "learning_rate": 8.596863077575244e-07, + "loss": 0.4207, + "mean_token_accuracy": 0.8614480495452881, + "num_tokens": 77495296.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.6023478507995605, + "learning_rate": 8.601102161933023e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8295204639434814, + "num_tokens": 77539262.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.6573652029037476, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8536758422851562, + "num_tokens": 77576012.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 7.867813110351562e-06, + "grad_norm": 1.5348528623580933, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4677, + "mean_token_accuracy": 0.8472705483436584, + "num_tokens": 77617587.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6735869646072388, + "learning_rate": 8.613819415006358e-07, + "loss": 0.3944, + "mean_token_accuracy": 0.8689647316932678, + "num_tokens": 77654211.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7811555862426758, + "learning_rate": 8.618058499364137e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.8530381917953491, + "num_tokens": 77695956.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6925345659255981, + "learning_rate": 8.622297583721916e-07, + "loss": 0.394, + "mean_token_accuracy": 0.8699683547019958, + "num_tokens": 77728686.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8360259532928467, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.8507569432258606, + "num_tokens": 77759106.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.659480094909668, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8625160455703735, + "num_tokens": 77794031.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.828405737876892, + "learning_rate": 8.635014836795251e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8531254529953003, + "num_tokens": 77828593.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8892723321914673, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4998, + "mean_token_accuracy": 0.8391843438148499, + "num_tokens": 77865118.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.710943579673767, + "learning_rate": 8.643493005510809e-07, + "loss": 0.428, + "mean_token_accuracy": 0.8584110140800476, + "num_tokens": 77902068.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7415566444396973, + "learning_rate": 8.647732089868588e-07, + "loss": 0.478, + "mean_token_accuracy": 0.8435733318328857, + "num_tokens": 77940973.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.5228590965270996, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8472337126731873, + "num_tokens": 77986191.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.7425522804260254, + "learning_rate": 8.656210258584146e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8459008932113647, + "num_tokens": 78024873.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6332775354385376, + "learning_rate": 8.660449342941924e-07, + "loss": 0.4327, + "mean_token_accuracy": 0.8571938276290894, + "num_tokens": 78065633.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.658612608909607, + "learning_rate": 8.664688427299704e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8360006809234619, + "num_tokens": 78111281.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.647533655166626, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8520078659057617, + "num_tokens": 78147525.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6906557083129883, + "learning_rate": 8.67316659601526e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8437169790267944, + "num_tokens": 78183850.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.8286128044128418, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.852820098400116, + "num_tokens": 78220616.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.729914903640747, + "learning_rate": 8.681644764730818e-07, + "loss": 0.4261, + "mean_token_accuracy": 0.8585190176963806, + "num_tokens": 78258352.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6093336343765259, + "learning_rate": 8.685883849088596e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.8641334772109985, + "num_tokens": 78292773.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.9110716581344604, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.838546097278595, + "num_tokens": 78324060.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.7037535905838013, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4243, + "mean_token_accuracy": 0.8595883250236511, + "num_tokens": 78356401.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 7.927417755126953e-06, + "grad_norm": 1.6838417053222656, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.860589861869812, + "num_tokens": 78393135.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.606276273727417, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4562, + "mean_token_accuracy": 0.8499807119369507, + "num_tokens": 78431847.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.567887783050537, + "learning_rate": 8.70707927087749e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8367767930030823, + "num_tokens": 78476701.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.588518738746643, + "learning_rate": 8.711318355235269e-07, + "loss": 0.3825, + "mean_token_accuracy": 0.8718855381011963, + "num_tokens": 78509911.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.6974672079086304, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4188, + "mean_token_accuracy": 0.8616715669631958, + "num_tokens": 78547405.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.6090669631958008, + "learning_rate": 8.719796523950826e-07, + "loss": 0.4341, + "mean_token_accuracy": 0.8575261831283569, + "num_tokens": 78583991.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.5279027223587036, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8482899069786072, + "num_tokens": 78624214.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.5556639432907104, + "learning_rate": 8.728274692666384e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8523934483528137, + "num_tokens": 78666607.0, + "step": 2060 + }, + { + "epoch": 0.26218038417504136, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.6146676540374756, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4175, + "mean_token_accuracy": 0.8606400489807129, + "num_tokens": 78704631.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.5821152925491333, + "learning_rate": 8.736752861381941e-07, + "loss": 0.4176, + "mean_token_accuracy": 0.8613097667694092, + "num_tokens": 78751478.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.720558762550354, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.849943220615387, + "num_tokens": 78789211.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.742408037185669, + "learning_rate": 8.745231030097499e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.845332682132721, + "num_tokens": 78824184.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.4571329355239868, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.8508235216140747, + "num_tokens": 78871830.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.6703861951828003, + "learning_rate": 8.753709198813056e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.8498894572257996, + "num_tokens": 78911880.0, + "step": 2066 + }, + { + "epoch": 0.2629436458465844, + "ewc_loss": 7.987022399902344e-06, + "grad_norm": 1.4792258739471436, + "learning_rate": 8.757948283170835e-07, + "loss": 0.3927, + "mean_token_accuracy": 0.8706855177879333, + "num_tokens": 78953613.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.616229772567749, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8551833033561707, + "num_tokens": 78999736.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.5893230438232422, + "learning_rate": 8.766426451886392e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8445809483528137, + "num_tokens": 79038811.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.5501383543014526, + "learning_rate": 8.770665536244171e-07, + "loss": 0.4106, + "mean_token_accuracy": 0.8659621477127075, + "num_tokens": 79078956.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6749483346939087, + "learning_rate": 8.774904620601949e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8322762250900269, + "num_tokens": 79119646.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6547240018844604, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8515093922615051, + "num_tokens": 79159531.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6599448919296265, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8476039171218872, + "num_tokens": 79200195.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.6765637397766113, + "learning_rate": 8.787621873675286e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8361822366714478, + "num_tokens": 79243496.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 8.046627044677734e-06, + "grad_norm": 1.561104416847229, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.8525477647781372, + "num_tokens": 79285239.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.572892189025879, + "learning_rate": 8.796100042390842e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8467797636985779, + "num_tokens": 79327601.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 3.6514899730682373, + "learning_rate": 8.800339126748622e-07, + "loss": 0.446, + "mean_token_accuracy": 0.8548214435577393, + "num_tokens": 79366348.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.746490478515625, + "learning_rate": 8.8045782111064e-07, + "loss": 0.4462, + "mean_token_accuracy": 0.8524472117424011, + "num_tokens": 79401116.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.5963839292526245, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4213, + "mean_token_accuracy": 0.8604996204376221, + "num_tokens": 79439306.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.8523428440093994, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4776, + "mean_token_accuracy": 0.8411754965782166, + "num_tokens": 79472724.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6220743656158447, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4707, + "mean_token_accuracy": 0.8475834131240845, + "num_tokens": 79513497.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.609031319618225, + "learning_rate": 8.821534548537515e-07, + "loss": 0.4301, + "mean_token_accuracy": 0.8563638925552368, + "num_tokens": 79555921.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.661561369895935, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4466, + "mean_token_accuracy": 0.8534406423568726, + "num_tokens": 79593275.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6868960857391357, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4255, + "mean_token_accuracy": 0.8594416975975037, + "num_tokens": 79630127.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6064972877502441, + "learning_rate": 8.834251801610852e-07, + "loss": 0.4725, + "mean_token_accuracy": 0.8449234366416931, + "num_tokens": 79671707.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.859556794166565, + "learning_rate": 8.83849088596863e-07, + "loss": 0.5093, + "mean_token_accuracy": 0.835106611251831, + "num_tokens": 79707742.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6294947862625122, + "learning_rate": 8.842729970326409e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8466440439224243, + "num_tokens": 79744949.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6528947353363037, + "learning_rate": 8.846969054684188e-07, + "loss": 0.4847, + "mean_token_accuracy": 0.8425529599189758, + "num_tokens": 79782428.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.5162134170532227, + "learning_rate": 8.851208139041967e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8593027591705322, + "num_tokens": 79823947.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.801471471786499, + "learning_rate": 8.855447223399745e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8487387299537659, + "num_tokens": 79859922.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.384703516960144, + "learning_rate": 8.859686307757524e-07, + "loss": 0.4147, + "mean_token_accuracy": 0.8624261021614075, + "num_tokens": 79910134.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6911975145339966, + "learning_rate": 8.863925392115302e-07, + "loss": 0.4415, + "mean_token_accuracy": 0.8542316555976868, + "num_tokens": 79943741.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.5744835138320923, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4425, + "mean_token_accuracy": 0.8534245491027832, + "num_tokens": 79985465.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6471387147903442, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4628, + "mean_token_accuracy": 0.847936749458313, + "num_tokens": 80021969.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.687893271446228, + "learning_rate": 8.876642645188639e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8562572598457336, + "num_tokens": 80058216.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.626670002937317, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.8510215282440186, + "num_tokens": 80095643.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.6434770822525024, + "learning_rate": 8.885120813904197e-07, + "loss": 0.4526, + "mean_token_accuracy": 0.8491331338882446, + "num_tokens": 80132590.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 8.106231689453125e-06, + "grad_norm": 1.7299448251724243, + "learning_rate": 8.889359898261976e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8481138348579407, + "num_tokens": 80167867.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.9289087057113647, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8462235927581787, + "num_tokens": 80200264.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.7708677053451538, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8495360612869263, + "num_tokens": 80234022.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.6270253658294678, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4917, + "mean_token_accuracy": 0.8352928757667542, + "num_tokens": 80273918.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.642318844795227, + "learning_rate": 8.90631623569309e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8561658263206482, + "num_tokens": 80310401.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.4679406881332397, + "learning_rate": 8.910555320050868e-07, + "loss": 0.3974, + "mean_token_accuracy": 0.868755578994751, + "num_tokens": 80351165.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 1.5528934001922607, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8500210046768188, + "num_tokens": 80390345.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 8.165836334228516e-06, + "grad_norm": 3.7151496410369873, + "learning_rate": 8.919033488766426e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8569791316986084, + "num_tokens": 80433146.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.64710533618927, + "learning_rate": 8.923272573124204e-07, + "loss": 0.4222, + "mean_token_accuracy": 0.8615517616271973, + "num_tokens": 80470380.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6437103748321533, + "learning_rate": 8.927511657481983e-07, + "loss": 0.4152, + "mean_token_accuracy": 0.8632318377494812, + "num_tokens": 80504061.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.617153525352478, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4396, + "mean_token_accuracy": 0.8555027842521667, + "num_tokens": 80537797.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.4960978031158447, + "learning_rate": 8.935989826197541e-07, + "loss": 0.4623, + "mean_token_accuracy": 0.8491063117980957, + "num_tokens": 80584893.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.636893630027771, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4298, + "mean_token_accuracy": 0.8566321134567261, + "num_tokens": 80624005.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.464774489402771, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4281, + "mean_token_accuracy": 0.858060359954834, + "num_tokens": 80670011.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.642883539199829, + "learning_rate": 8.948707079270878e-07, + "loss": 0.4059, + "mean_token_accuracy": 0.8647217154502869, + "num_tokens": 80705827.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.685475468635559, + "learning_rate": 8.952946163628656e-07, + "loss": 0.3978, + "mean_token_accuracy": 0.8683629035949707, + "num_tokens": 80737656.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.6983740329742432, + "learning_rate": 8.957185247986434e-07, + "loss": 0.4664, + "mean_token_accuracy": 0.8459116816520691, + "num_tokens": 80776014.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 8.225440979003906e-06, + "grad_norm": 1.5938481092453003, + "learning_rate": 8.961424332344213e-07, + "loss": 0.4321, + "mean_token_accuracy": 0.8562323451042175, + "num_tokens": 80815166.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.5695173740386963, + "learning_rate": 8.965663416701992e-07, + "loss": 0.413, + "mean_token_accuracy": 0.8647029995918274, + "num_tokens": 80854962.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6068652868270874, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.845971941947937, + "num_tokens": 80895169.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.7310431003570557, + "learning_rate": 8.97414158541755e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8445049524307251, + "num_tokens": 80931853.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.530092477798462, + "learning_rate": 8.978380669775328e-07, + "loss": 0.431, + "mean_token_accuracy": 0.8587255477905273, + "num_tokens": 80974244.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.5436378717422485, + "learning_rate": 8.982619754133107e-07, + "loss": 0.401, + "mean_token_accuracy": 0.8672214150428772, + "num_tokens": 81014539.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.5857397317886353, + "learning_rate": 8.986858838490886e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8508593440055847, + "num_tokens": 81052984.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 8.285045623779297e-06, + "grad_norm": 1.6365687847137451, + "learning_rate": 8.991097922848663e-07, + "loss": 0.5112, + "mean_token_accuracy": 0.8317818641662598, + "num_tokens": 81095422.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.6738775968551636, + "learning_rate": 8.995337007206443e-07, + "loss": 0.4311, + "mean_token_accuracy": 0.8586889505386353, + "num_tokens": 81128900.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.508846640586853, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4148, + "mean_token_accuracy": 0.8639904856681824, + "num_tokens": 81169217.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6332244873046875, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4052, + "mean_token_accuracy": 0.8662143349647522, + "num_tokens": 81205649.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.6772525310516357, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.858202338218689, + "num_tokens": 81241665.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.7168935537338257, + "learning_rate": 9.012293344637558e-07, + "loss": 0.4558, + "mean_token_accuracy": 0.8483327627182007, + "num_tokens": 81275632.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.7013740539550781, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8417931199073792, + "num_tokens": 81312461.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.5799732208251953, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4411, + "mean_token_accuracy": 0.8553929924964905, + "num_tokens": 81356428.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.6561918258666992, + "learning_rate": 9.025010597710894e-07, + "loss": 0.4077, + "mean_token_accuracy": 0.8636533617973328, + "num_tokens": 81388381.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.4771728515625, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4125, + "mean_token_accuracy": 0.8655287027359009, + "num_tokens": 81431824.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 8.344650268554688e-06, + "grad_norm": 1.621626377105713, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4386, + "mean_token_accuracy": 0.8541222810745239, + "num_tokens": 81470408.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6866940259933472, + "learning_rate": 9.037727850784231e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8427181839942932, + "num_tokens": 81508932.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6041456460952759, + "learning_rate": 9.041966935142009e-07, + "loss": 0.3972, + "mean_token_accuracy": 0.8680351376533508, + "num_tokens": 81542909.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.751837968826294, + "learning_rate": 9.046206019499788e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8477240800857544, + "num_tokens": 81581990.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.694057583808899, + "learning_rate": 9.050445103857567e-07, + "loss": 0.4202, + "mean_token_accuracy": 0.8622873425483704, + "num_tokens": 81617114.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6475040912628174, + "learning_rate": 9.054684188215344e-07, + "loss": 0.4392, + "mean_token_accuracy": 0.8598699569702148, + "num_tokens": 81655840.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.5598851442337036, + "learning_rate": 9.058923272573124e-07, + "loss": 0.409, + "mean_token_accuracy": 0.8617292642593384, + "num_tokens": 81692883.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.7382450103759766, + "learning_rate": 9.063162356930902e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8393329977989197, + "num_tokens": 81729215.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.7447540760040283, + "learning_rate": 9.067401441288681e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8492128849029541, + "num_tokens": 81763451.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.654649257659912, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8563624620437622, + "num_tokens": 81807490.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6883807182312012, + "learning_rate": 9.075879610004239e-07, + "loss": 0.4139, + "mean_token_accuracy": 0.863635241985321, + "num_tokens": 81847038.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.8359830379486084, + "learning_rate": 9.080118694362017e-07, + "loss": 0.4543, + "mean_token_accuracy": 0.8481266498565674, + "num_tokens": 81879607.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6007906198501587, + "learning_rate": 9.084357778719796e-07, + "loss": 0.441, + "mean_token_accuracy": 0.8526995182037354, + "num_tokens": 81916496.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.5919517278671265, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4262, + "mean_token_accuracy": 0.8591527342796326, + "num_tokens": 81957162.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6485012769699097, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8429667949676514, + "num_tokens": 81992564.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.72803795337677, + "learning_rate": 9.097075031793132e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8316022753715515, + "num_tokens": 82029222.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6622002124786377, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8448535203933716, + "num_tokens": 82066413.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6222971677780151, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4524, + "mean_token_accuracy": 0.8490687608718872, + "num_tokens": 82103661.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.5890848636627197, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8528842926025391, + "num_tokens": 82141415.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6223260164260864, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4551, + "mean_token_accuracy": 0.852187991142273, + "num_tokens": 82180129.0, + "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.552873969078064, + "learning_rate": 9.118270453582026e-07, + "loss": 0.4679, + "mean_token_accuracy": 0.8493494391441345, + "num_tokens": 82222116.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6477911472320557, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4403, + "mean_token_accuracy": 0.8569415807723999, + "num_tokens": 82260579.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 2.2672934532165527, + "learning_rate": 9.126748622297584e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8503676652908325, + "num_tokens": 82300460.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.7119957208633423, + "learning_rate": 9.130987706655362e-07, + "loss": 0.4012, + "mean_token_accuracy": 0.8674372434616089, + "num_tokens": 82334407.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.6116300821304321, + "learning_rate": 9.135226791013141e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8478936553001404, + "num_tokens": 82373263.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.5714941024780273, + "learning_rate": 9.13946587537092e-07, + "loss": 0.4096, + "mean_token_accuracy": 0.8645254373550415, + "num_tokens": 82412878.0, + "step": 2157 + }, + { + "epoch": 0.2745197811983208, + "ewc_loss": 8.463859558105469e-06, + "grad_norm": 1.7697265148162842, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4011, + "mean_token_accuracy": 0.8664923906326294, + "num_tokens": 82447310.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 1.5711554288864136, + "learning_rate": 9.147944044086476e-07, + "loss": 0.4351, + "mean_token_accuracy": 0.8559471368789673, + "num_tokens": 82486035.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 1.6049468517303467, + "learning_rate": 9.152183128444255e-07, + "loss": 0.4393, + "mean_token_accuracy": 0.8582533597946167, + "num_tokens": 82524839.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 1.6891909837722778, + "learning_rate": 9.156422212802034e-07, + "loss": 0.4748, + "mean_token_accuracy": 0.8436779975891113, + "num_tokens": 82564649.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 1.6490848064422607, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8536991477012634, + "num_tokens": 82601946.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 1.7891440391540527, + "learning_rate": 9.164900381517592e-07, + "loss": 0.4531, + "mean_token_accuracy": 0.8497462868690491, + "num_tokens": 82635994.0, + "step": 2163 + }, + { + "epoch": 0.2752830428698639, + "ewc_loss": 8.52346420288086e-06, + "grad_norm": 1.5498847961425781, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4161, + "mean_token_accuracy": 0.8640993237495422, + "num_tokens": 82676502.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 8.58306884765625e-06, + "grad_norm": 1.7160111665725708, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4674, + "mean_token_accuracy": 0.8463183641433716, + "num_tokens": 82712248.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.533538818359375, + "learning_rate": 9.177617634590928e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8489000797271729, + "num_tokens": 82755517.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.615204095840454, + "learning_rate": 9.181856718948706e-07, + "loss": 0.4061, + "mean_token_accuracy": 0.8631479740142822, + "num_tokens": 82791585.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5568931102752686, + "learning_rate": 9.186095803306485e-07, + "loss": 0.4139, + "mean_token_accuracy": 0.8642746806144714, + "num_tokens": 82828703.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.6319390535354614, + "learning_rate": 9.190334887664264e-07, + "loss": 0.4332, + "mean_token_accuracy": 0.8584396243095398, + "num_tokens": 82865633.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5248363018035889, + "learning_rate": 9.194573972022043e-07, + "loss": 0.4053, + "mean_token_accuracy": 0.8663821816444397, + "num_tokens": 82907264.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5100685358047485, + "learning_rate": 9.198813056379822e-07, + "loss": 0.4135, + "mean_token_accuracy": 0.8635079860687256, + "num_tokens": 82946856.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5546857118606567, + "learning_rate": 9.2030521407376e-07, + "loss": 0.4118, + "mean_token_accuracy": 0.8630770444869995, + "num_tokens": 82983648.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.693987250328064, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8412730097770691, + "num_tokens": 83023247.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.560464859008789, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8609848022460938, + "num_tokens": 83061176.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.4919949769973755, + "learning_rate": 9.215769393810936e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8554575443267822, + "num_tokens": 83103687.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 2.193011522293091, + "learning_rate": 9.220008478168715e-07, + "loss": 0.4515, + "mean_token_accuracy": 0.8534202575683594, + "num_tokens": 83146766.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.595570683479309, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8527528047561646, + "num_tokens": 83189695.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.6082265377044678, + "learning_rate": 9.228486646884273e-07, + "loss": 0.389, + "mean_token_accuracy": 0.8702031373977661, + "num_tokens": 83225698.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.4563112258911133, + "learning_rate": 9.232725731242052e-07, + "loss": 0.4308, + "mean_token_accuracy": 0.8580833077430725, + "num_tokens": 83269037.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.7060554027557373, + "learning_rate": 9.23696481559983e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8523126840591431, + "num_tokens": 83303783.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.6942238807678223, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8461191654205322, + "num_tokens": 83338713.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5631879568099976, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8472598791122437, + "num_tokens": 83381478.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5692201852798462, + "learning_rate": 9.249682068673165e-07, + "loss": 0.44, + "mean_token_accuracy": 0.863906979560852, + "num_tokens": 83419862.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.5725443363189697, + "learning_rate": 9.253921153030945e-07, + "loss": 0.4205, + "mean_token_accuracy": 0.8588814735412598, + "num_tokens": 83457122.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.4717446565628052, + "learning_rate": 9.258160237388723e-07, + "loss": 0.3612, + "mean_token_accuracy": 0.877646803855896, + "num_tokens": 83497340.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.6668263673782349, + "learning_rate": 9.262399321746503e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8396222591400146, + "num_tokens": 83536932.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.6263636350631714, + "learning_rate": 9.266638406104281e-07, + "loss": 0.4238, + "mean_token_accuracy": 0.8627339601516724, + "num_tokens": 83574840.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 8.64267349243164e-06, + "grad_norm": 1.627371907234192, + "learning_rate": 9.27087749046206e-07, + "loss": 0.4779, + "mean_token_accuracy": 0.8467817306518555, + "num_tokens": 83615522.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.4474490880966187, + "learning_rate": 9.275116574819839e-07, + "loss": 0.379, + "mean_token_accuracy": 0.8759996891021729, + "num_tokens": 83656109.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.632722020149231, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8497270345687866, + "num_tokens": 83693682.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.5761396884918213, + "learning_rate": 9.283594743535395e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8551941514015198, + "num_tokens": 83731575.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.6715686321258545, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8493132591247559, + "num_tokens": 83766248.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.5760670900344849, + "learning_rate": 9.292072912250953e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8512316942214966, + "num_tokens": 83806834.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.5849730968475342, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8461454510688782, + "num_tokens": 83848672.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.7252544164657593, + "learning_rate": 9.300551080966511e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8449451923370361, + "num_tokens": 83882587.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.7054460048675537, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4293, + "mean_token_accuracy": 0.8571547269821167, + "num_tokens": 83917777.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.5296870470046997, + "learning_rate": 9.309029249682068e-07, + "loss": 0.3908, + "mean_token_accuracy": 0.8659257888793945, + "num_tokens": 83956812.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.684144377708435, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4148, + "mean_token_accuracy": 0.8633272647857666, + "num_tokens": 83993157.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.5824369192123413, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4503, + "mean_token_accuracy": 0.8539056777954102, + "num_tokens": 84031785.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.69838547706604, + "learning_rate": 9.321746502755404e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.84690260887146, + "num_tokens": 84066766.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.757344365119934, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4454, + "mean_token_accuracy": 0.853787362575531, + "num_tokens": 84106549.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 8.702278137207031e-06, + "grad_norm": 1.7104893922805786, + "learning_rate": 9.330224671470962e-07, + "loss": 0.5124, + "mean_token_accuracy": 0.8405764698982239, + "num_tokens": 84144948.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.8457950353622437, + "learning_rate": 9.334463755828741e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8456505537033081, + "num_tokens": 84177325.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.465756893157959, + "learning_rate": 9.338702840186519e-07, + "loss": 0.4056, + "mean_token_accuracy": 0.8647418022155762, + "num_tokens": 84220471.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6882238388061523, + "learning_rate": 9.342941924544298e-07, + "loss": 0.424, + "mean_token_accuracy": 0.8575825691223145, + "num_tokens": 84256808.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.5693682432174683, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8548164367675781, + "num_tokens": 84295212.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.9851328134536743, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4218, + "mean_token_accuracy": 0.8580920696258545, + "num_tokens": 84323498.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.669316291809082, + "learning_rate": 9.355659177617634e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8597244024276733, + "num_tokens": 84361714.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.5204346179962158, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4693, + "mean_token_accuracy": 0.8460690975189209, + "num_tokens": 84404611.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.7112252712249756, + "learning_rate": 9.364137346333192e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8469130992889404, + "num_tokens": 84443178.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.626531720161438, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4338, + "mean_token_accuracy": 0.857749342918396, + "num_tokens": 84482629.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.524841547012329, + "learning_rate": 9.372615515048749e-07, + "loss": 0.4088, + "mean_token_accuracy": 0.8648464679718018, + "num_tokens": 84521159.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6332788467407227, + "learning_rate": 9.376854599406528e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8497527241706848, + "num_tokens": 84558098.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.7404935359954834, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8519511222839355, + "num_tokens": 84593384.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.5906939506530762, + "learning_rate": 9.385332768122085e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8428336977958679, + "num_tokens": 84636150.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6730047464370728, + "learning_rate": 9.389571852479864e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8461509943008423, + "num_tokens": 84673468.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.7347402572631836, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8515567779541016, + "num_tokens": 84708867.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6550953388214111, + "learning_rate": 9.398050021195422e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.853977620601654, + "num_tokens": 84742059.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6606916189193726, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.8487890958786011, + "num_tokens": 84780439.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.614367961883545, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4617, + "mean_token_accuracy": 0.8459833860397339, + "num_tokens": 84819931.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6581573486328125, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8509478569030762, + "num_tokens": 84855549.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6534197330474854, + "learning_rate": 9.415006358626536e-07, + "loss": 0.4364, + "mean_token_accuracy": 0.8557701110839844, + "num_tokens": 84890987.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6060893535614014, + "learning_rate": 9.419245442984314e-07, + "loss": 0.3996, + "mean_token_accuracy": 0.8677910566329956, + "num_tokens": 84928664.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.747588872909546, + "learning_rate": 9.423484527342094e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.846582293510437, + "num_tokens": 84961802.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 8.761882781982422e-06, + "grad_norm": 1.6045950651168823, + "learning_rate": 9.427723611699872e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8448929786682129, + "num_tokens": 85001075.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5277612209320068, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4215, + "mean_token_accuracy": 0.861937940120697, + "num_tokens": 85044173.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6849274635314941, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.8486076593399048, + "num_tokens": 85083220.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 4.691128730773926, + "learning_rate": 9.440440864773208e-07, + "loss": 0.4166, + "mean_token_accuracy": 0.860586404800415, + "num_tokens": 85118082.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5440635681152344, + "learning_rate": 9.444679949130987e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8547682762145996, + "num_tokens": 85158267.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.665216326713562, + "learning_rate": 9.448919033488766e-07, + "loss": 0.3871, + "mean_token_accuracy": 0.8669303059577942, + "num_tokens": 85190587.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.567147135734558, + "learning_rate": 9.453158117846544e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8573691844940186, + "num_tokens": 85227683.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6423063278198242, + "learning_rate": 9.457397202204324e-07, + "loss": 0.4042, + "mean_token_accuracy": 0.8650147318840027, + "num_tokens": 85263174.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6677253246307373, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8380616903305054, + "num_tokens": 85300954.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6168104410171509, + "learning_rate": 9.465875370919882e-07, + "loss": 0.4768, + "mean_token_accuracy": 0.8470097780227661, + "num_tokens": 85339246.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5716004371643066, + "learning_rate": 9.470114455277659e-07, + "loss": 0.4406, + "mean_token_accuracy": 0.8575483560562134, + "num_tokens": 85381118.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5548996925354004, + "learning_rate": 9.474353539635438e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8431409597396851, + "num_tokens": 85427722.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.7320687770843506, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4802, + "mean_token_accuracy": 0.8432056307792664, + "num_tokens": 85465918.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6620417833328247, + "learning_rate": 9.482831708350996e-07, + "loss": 0.46, + "mean_token_accuracy": 0.847193717956543, + "num_tokens": 85507021.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.7886098623275757, + "learning_rate": 9.487070792708775e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8535423874855042, + "num_tokens": 85539838.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5233060121536255, + "learning_rate": 9.491309877066554e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.858367383480072, + "num_tokens": 85581185.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5851205587387085, + "learning_rate": 9.495548961424332e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8468567132949829, + "num_tokens": 85622494.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.4930598735809326, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4248, + "mean_token_accuracy": 0.8589495420455933, + "num_tokens": 85666360.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.7090855836868286, + "learning_rate": 9.504027130139889e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8413278460502625, + "num_tokens": 85700390.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.730474829673767, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4787, + "mean_token_accuracy": 0.8414472341537476, + "num_tokens": 85736915.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.579228401184082, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4112, + "mean_token_accuracy": 0.86629319190979, + "num_tokens": 85776430.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6722246408462524, + "learning_rate": 9.516744383213225e-07, + "loss": 0.4699, + "mean_token_accuracy": 0.8439193964004517, + "num_tokens": 85820700.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.6456130743026733, + "learning_rate": 9.520983467571005e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.8542084097862244, + "num_tokens": 85857954.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.7669641971588135, + "learning_rate": 9.525222551928783e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8351398706436157, + "num_tokens": 85892297.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5101155042648315, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4378, + "mean_token_accuracy": 0.8540381789207458, + "num_tokens": 85933624.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5502450466156006, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8514971733093262, + "num_tokens": 85973782.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.8193533420562744, + "learning_rate": 9.537939805002118e-07, + "loss": 0.478, + "mean_token_accuracy": 0.844399094581604, + "num_tokens": 86004174.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.7628874778747559, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8514840602874756, + "num_tokens": 86039888.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.669701099395752, + "learning_rate": 9.546417973717677e-07, + "loss": 0.4122, + "mean_token_accuracy": 0.864849328994751, + "num_tokens": 86076642.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 8.821487426757812e-06, + "grad_norm": 1.5693806409835815, + "learning_rate": 9.550657058075455e-07, + "loss": 0.421, + "mean_token_accuracy": 0.8633815050125122, + "num_tokens": 86116223.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 1.5844839811325073, + "learning_rate": 9.554896142433234e-07, + "loss": 0.4344, + "mean_token_accuracy": 0.8550021648406982, + "num_tokens": 86157018.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 8.881092071533203e-06, + "grad_norm": 1.5474727153778076, + "learning_rate": 9.559135226791012e-07, + "loss": 0.4167, + "mean_token_accuracy": 0.8624328970909119, + "num_tokens": 86195321.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.540008544921875, + "learning_rate": 9.563374311148793e-07, + "loss": 0.3842, + "mean_token_accuracy": 0.8716729283332825, + "num_tokens": 86231476.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6103872060775757, + "learning_rate": 9.56761339550657e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8453240394592285, + "num_tokens": 86269622.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.5775984525680542, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8518866300582886, + "num_tokens": 86313099.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.7955563068389893, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4501, + "mean_token_accuracy": 0.8490613102912903, + "num_tokens": 86345606.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.8997342586517334, + "learning_rate": 9.580330648579906e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8362910747528076, + "num_tokens": 86383371.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6565455198287964, + "learning_rate": 9.584569732937685e-07, + "loss": 0.4328, + "mean_token_accuracy": 0.8596601486206055, + "num_tokens": 86420650.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6535388231277466, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4345, + "mean_token_accuracy": 0.8547702431678772, + "num_tokens": 86455974.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.5983952283859253, + "learning_rate": 9.593047901653242e-07, + "loss": 0.4542, + "mean_token_accuracy": 0.850347101688385, + "num_tokens": 86495498.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.5794119834899902, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4402, + "mean_token_accuracy": 0.8542965650558472, + "num_tokens": 86536064.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6349557638168335, + "learning_rate": 9.601526070368799e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8497076034545898, + "num_tokens": 86581665.0, + "step": 2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.5796173810958862, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8630735874176025, + "num_tokens": 86625610.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6132636070251465, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8408060669898987, + "num_tokens": 86664418.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6130249500274658, + "learning_rate": 9.614243323442136e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8384860754013062, + "num_tokens": 86703009.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.802622675895691, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4419, + "mean_token_accuracy": 0.8545242547988892, + "num_tokens": 86737176.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6024142503738403, + "learning_rate": 9.622721492157693e-07, + "loss": 0.4418, + "mean_token_accuracy": 0.8565118312835693, + "num_tokens": 86776750.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.6688640117645264, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4449, + "mean_token_accuracy": 0.8547078371047974, + "num_tokens": 86816244.0, + "step": 2272 + }, + { + "epoch": 0.28914896323622946, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.5337116718292236, + "learning_rate": 9.63119966087325e-07, + "loss": 0.3951, + "mean_token_accuracy": 0.8686423301696777, + "num_tokens": 86853472.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.593299388885498, + "learning_rate": 9.635438745231029e-07, + "loss": 0.411, + "mean_token_accuracy": 0.8634998202323914, + "num_tokens": 86892001.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 8.940696716308594e-06, + "grad_norm": 1.7844667434692383, + "learning_rate": 9.63967782958881e-07, + "loss": 0.4071, + "mean_token_accuracy": 0.8648931384086609, + "num_tokens": 86926062.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.5384557247161865, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8631683588027954, + "num_tokens": 86969254.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.6522685289382935, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.8513191938400269, + "num_tokens": 87007489.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.7889844179153442, + "learning_rate": 9.652395082662145e-07, + "loss": 0.4366, + "mean_token_accuracy": 0.8552199602127075, + "num_tokens": 87043267.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.6357415914535522, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8456206917762756, + "num_tokens": 87081747.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.73231840133667, + "learning_rate": 9.660873251377701e-07, + "loss": 0.4484, + "mean_token_accuracy": 0.8534804582595825, + "num_tokens": 87118681.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.7243720293045044, + "learning_rate": 9.66511233573548e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8417916893959045, + "num_tokens": 87151800.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 9.059906005859375e-06, + "grad_norm": 1.6991362571716309, + "learning_rate": 9.669351420093258e-07, + "loss": 0.5271, + "mean_token_accuracy": 0.8294157385826111, + "num_tokens": 87190826.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.6233971118927002, + "learning_rate": 9.67359050445104e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8365262746810913, + "num_tokens": 87232514.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.6307963132858276, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4584, + "mean_token_accuracy": 0.8486170768737793, + "num_tokens": 87274035.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.688401222229004, + "learning_rate": 9.682068673166596e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8514401912689209, + "num_tokens": 87313101.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.52302086353302, + "learning_rate": 9.686307757524374e-07, + "loss": 0.4201, + "mean_token_accuracy": 0.8608346581459045, + "num_tokens": 87352573.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.6943109035491943, + "learning_rate": 9.690546841882153e-07, + "loss": 0.4263, + "mean_token_accuracy": 0.8601601123809814, + "num_tokens": 87385863.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.539291501045227, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8522735834121704, + "num_tokens": 87425331.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.8400418758392334, + "learning_rate": 9.69902501059771e-07, + "loss": 0.4859, + "mean_token_accuracy": 0.8401345014572144, + "num_tokens": 87457995.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.552889347076416, + "learning_rate": 9.703264094955488e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8417925238609314, + "num_tokens": 87499673.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.5642821788787842, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.8506370782852173, + "num_tokens": 87539911.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.5828361511230469, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4209, + "mean_token_accuracy": 0.859731137752533, + "num_tokens": 87575964.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.726916790008545, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4494, + "mean_token_accuracy": 0.8541022539138794, + "num_tokens": 87613617.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 9.119510650634766e-06, + "grad_norm": 1.7151638269424438, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4134, + "mean_token_accuracy": 0.8643237948417664, + "num_tokens": 87649075.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.6649318933486938, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8511902689933777, + "num_tokens": 87688014.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.7351330518722534, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4424, + "mean_token_accuracy": 0.8555781841278076, + "num_tokens": 87720574.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.6730678081512451, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.843325138092041, + "num_tokens": 87760233.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.6704529523849487, + "learning_rate": 9.737176769817718e-07, + "loss": 0.4182, + "mean_token_accuracy": 0.8656342029571533, + "num_tokens": 87794814.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.6065841913223267, + "learning_rate": 9.741415854175499e-07, + "loss": 0.3767, + "mean_token_accuracy": 0.8767256736755371, + "num_tokens": 87829676.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 9.179115295410156e-06, + "grad_norm": 1.576392650604248, + "learning_rate": 9.745654938533277e-07, + "loss": 0.4525, + "mean_token_accuracy": 0.8520981073379517, + "num_tokens": 87869570.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6226760149002075, + "learning_rate": 9.749894022891056e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.8618802428245544, + "num_tokens": 87906970.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.599035382270813, + "learning_rate": 9.754133107248834e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8485186696052551, + "num_tokens": 87948410.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.7330925464630127, + "learning_rate": 9.758372191606612e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.8427765965461731, + "num_tokens": 87988687.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.974710464477539, + "learning_rate": 9.76261127596439e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8423832654953003, + "num_tokens": 88018601.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6053422689437866, + "learning_rate": 9.76685036032217e-07, + "loss": 0.4329, + "mean_token_accuracy": 0.8618743419647217, + "num_tokens": 88058390.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.737476110458374, + "learning_rate": 9.771089444679948e-07, + "loss": 0.4385, + "mean_token_accuracy": 0.8541103601455688, + "num_tokens": 88095948.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.7433627843856812, + "learning_rate": 9.775328529037728e-07, + "loss": 0.4245, + "mean_token_accuracy": 0.8555582165718079, + "num_tokens": 88131715.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.5579365491867065, + "learning_rate": 9.779567613395507e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8490164279937744, + "num_tokens": 88171190.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.5750123262405396, + "learning_rate": 9.783806697753285e-07, + "loss": 0.3832, + "mean_token_accuracy": 0.8724466562271118, + "num_tokens": 88209065.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6835120916366577, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4445, + "mean_token_accuracy": 0.8551928400993347, + "num_tokens": 88249080.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.7146238088607788, + "learning_rate": 9.792284866468842e-07, + "loss": 0.479, + "mean_token_accuracy": 0.841267466545105, + "num_tokens": 88288717.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.8866499662399292, + "learning_rate": 9.79652395082662e-07, + "loss": 0.495, + "mean_token_accuracy": 0.8364197611808777, + "num_tokens": 88320797.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6468604803085327, + "learning_rate": 9.8007630351844e-07, + "loss": 0.3898, + "mean_token_accuracy": 0.8702942728996277, + "num_tokens": 88358752.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.5747936964035034, + "learning_rate": 9.805002119542178e-07, + "loss": 0.451, + "mean_token_accuracy": 0.852393627166748, + "num_tokens": 88398749.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6172850131988525, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4175, + "mean_token_accuracy": 0.8604791164398193, + "num_tokens": 88437685.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.6512200832366943, + "learning_rate": 9.813480288257737e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.8500515222549438, + "num_tokens": 88475970.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.5706045627593994, + "learning_rate": 9.817719372615515e-07, + "loss": 0.3747, + "mean_token_accuracy": 0.875332236289978, + "num_tokens": 88514834.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 9.238719940185547e-06, + "grad_norm": 1.4814831018447876, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4303, + "mean_token_accuracy": 0.857999324798584, + "num_tokens": 88560547.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.693367600440979, + "learning_rate": 9.826197541331072e-07, + "loss": 0.3966, + "mean_token_accuracy": 0.8676677942276001, + "num_tokens": 88595980.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6386417150497437, + "learning_rate": 9.83043662568885e-07, + "loss": 0.4408, + "mean_token_accuracy": 0.8549449443817139, + "num_tokens": 88631738.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.4514371156692505, + "learning_rate": 9.83467571004663e-07, + "loss": 0.4349, + "mean_token_accuracy": 0.8577936887741089, + "num_tokens": 88677960.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.582306981086731, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4244, + "mean_token_accuracy": 0.8577306270599365, + "num_tokens": 88717270.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6686077117919922, + "learning_rate": 9.843153878762188e-07, + "loss": 0.4079, + "mean_token_accuracy": 0.8654924035072327, + "num_tokens": 88757329.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.5624579191207886, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4221, + "mean_token_accuracy": 0.8593029975891113, + "num_tokens": 88796631.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.663683533668518, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4276, + "mean_token_accuracy": 0.8577146530151367, + "num_tokens": 88833983.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.6179044246673584, + "learning_rate": 9.855871131835523e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8469487428665161, + "num_tokens": 88872959.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6340522766113281, + "learning_rate": 9.860110216193302e-07, + "loss": 0.4416, + "mean_token_accuracy": 0.8517346382141113, + "num_tokens": 88909387.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6247761249542236, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4199, + "mean_token_accuracy": 0.8592575788497925, + "num_tokens": 88946640.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6929746866226196, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8593775033950806, + "num_tokens": 88983481.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.528760313987732, + "learning_rate": 9.872827469266637e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8567321300506592, + "num_tokens": 89027061.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6344549655914307, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4051, + "mean_token_accuracy": 0.867556631565094, + "num_tokens": 89061925.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.4836667776107788, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4258, + "mean_token_accuracy": 0.861376166343689, + "num_tokens": 89102912.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.588265061378479, + "learning_rate": 9.885544722339975e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8473892211914062, + "num_tokens": 89142790.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.6727327108383179, + "learning_rate": 9.889783806697753e-07, + "loss": 0.4421, + "mean_token_accuracy": 0.8562774062156677, + "num_tokens": 89180465.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.659395456314087, + "learning_rate": 9.894022891055532e-07, + "loss": 0.4354, + "mean_token_accuracy": 0.8538872003555298, + "num_tokens": 89217921.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6737778186798096, + "learning_rate": 9.89826197541331e-07, + "loss": 0.4361, + "mean_token_accuracy": 0.8583962917327881, + "num_tokens": 89255655.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.585840106010437, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4078, + "mean_token_accuracy": 0.8668193817138672, + "num_tokens": 89299040.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.5527896881103516, + "learning_rate": 9.906740144128867e-07, + "loss": 0.3955, + "mean_token_accuracy": 0.8654752373695374, + "num_tokens": 89335226.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.5310673713684082, + "learning_rate": 9.910979228486648e-07, + "loss": 0.4084, + "mean_token_accuracy": 0.8630966544151306, + "num_tokens": 89373102.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 9.298324584960938e-06, + "grad_norm": 1.6944717168807983, + "learning_rate": 9.915218312844426e-07, + "loss": 0.4706, + "mean_token_accuracy": 0.844460129737854, + "num_tokens": 89412487.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.5281879901885986, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4322, + "mean_token_accuracy": 0.8571373224258423, + "num_tokens": 89453394.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.5057847499847412, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4288, + "mean_token_accuracy": 0.8596645593643188, + "num_tokens": 89494469.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.7238396406173706, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4216, + "mean_token_accuracy": 0.8580604791641235, + "num_tokens": 89525745.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.6977065801620483, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4062, + "mean_token_accuracy": 0.86650550365448, + "num_tokens": 89562013.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.879386067390442, + "learning_rate": 9.936413734633318e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8509423732757568, + "num_tokens": 89597676.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.7242828607559204, + "learning_rate": 9.940652818991097e-07, + "loss": 0.461, + "mean_token_accuracy": 0.8481331467628479, + "num_tokens": 89635703.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 9.357929229736328e-06, + "grad_norm": 1.674790382385254, + "learning_rate": 9.944891903348877e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8511884212493896, + "num_tokens": 89671942.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 1.6753910779953003, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4429, + "mean_token_accuracy": 0.8515928983688354, + "num_tokens": 89710878.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 1.5699024200439453, + "learning_rate": 9.953370072064432e-07, + "loss": 0.43, + "mean_token_accuracy": 0.8556080460548401, + "num_tokens": 89750149.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 1.7738063335418701, + "learning_rate": 9.957609156422213e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8466379642486572, + "num_tokens": 89784212.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 9.417533874511719e-06, + "grad_norm": 1.564306378364563, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4279, + "mean_token_accuracy": 0.8570088148117065, + "num_tokens": 89821147.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.656413197517395, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4394, + "mean_token_accuracy": 0.8573288321495056, + "num_tokens": 89861014.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.6427327394485474, + "learning_rate": 9.970326409495548e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8495404124259949, + "num_tokens": 89902095.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.7800081968307495, + "learning_rate": 9.974565493853327e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.8571457862854004, + "num_tokens": 89935506.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.5642704963684082, + "learning_rate": 9.978804578211107e-07, + "loss": 0.4056, + "mean_token_accuracy": 0.8681110143661499, + "num_tokens": 89973084.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.6069415807724, + "learning_rate": 9.983043662568886e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.847275972366333, + "num_tokens": 90010734.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.9411273002624512, + "learning_rate": 9.987282746926662e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8284528255462646, + "num_tokens": 90040821.0, + "step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.7751561403274536, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8536629676818848, + "num_tokens": 90080037.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.7750000953674316, + "learning_rate": 9.995760915642221e-07, + "loss": 0.4476, + "mean_token_accuracy": 0.8544917106628418, + "num_tokens": 90115130.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.5873184204101562, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8578554391860962, + "num_tokens": 90156040.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.611628532409668, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8449178338050842, + "num_tokens": 90198157.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.5152487754821777, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8485859036445618, + "num_tokens": 90240023.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.7165119647979736, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8595971465110779, + "num_tokens": 90275177.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.4917536973953247, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8647035360336304, + "num_tokens": 90316935.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.611380696296692, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8719836473464966, + "num_tokens": 90356983.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.5409934520721436, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8552997708320618, + "num_tokens": 90396202.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.6068551540374756, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8517508506774902, + "num_tokens": 90433982.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.8304407596588135, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8438240885734558, + "num_tokens": 90468447.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.68513023853302, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8547936677932739, + "num_tokens": 90501875.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.6407946348190308, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8660731315612793, + "num_tokens": 90538323.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.6690959930419922, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.873163104057312, + "num_tokens": 90574523.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 9.47713851928711e-06, + "grad_norm": 1.5525375604629517, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8454824090003967, + "num_tokens": 90613912.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 1.5354610681533813, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8489893674850464, + "num_tokens": 90659302.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 1.642843246459961, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8460111618041992, + "num_tokens": 90695395.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.4674549102783203, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8728827238082886, + "num_tokens": 90732836.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.4713667631149292, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8432990312576294, + "num_tokens": 90781688.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.6047836542129517, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.858172595500946, + "num_tokens": 90818902.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.4594271183013916, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8669965267181396, + "num_tokens": 90860606.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 1.4887416362762451, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8709077835083008, + "num_tokens": 90901441.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.7305736541748047, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8487502336502075, + "num_tokens": 90937416.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 9.59634780883789e-06, + "grad_norm": 1.5959182977676392, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8549613952636719, + "num_tokens": 90973970.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.4801814556121826, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8688904047012329, + "num_tokens": 91015390.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.9883852005004883, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8363497853279114, + "num_tokens": 91050625.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 9.655952453613281e-06, + "grad_norm": 1.6784330606460571, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8422824740409851, + "num_tokens": 91089899.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.6636815071105957, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8560298681259155, + "num_tokens": 91128375.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.6419109106063843, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8476284146308899, + "num_tokens": 91169919.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 1.6783262491226196, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8568269610404968, + "num_tokens": 91205901.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.5673507452011108, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8710871934890747, + "num_tokens": 91244604.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.566299319267273, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8658095598220825, + "num_tokens": 91286497.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.6053791046142578, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8371005058288574, + "num_tokens": 91328745.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 1.5866661071777344, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8592313528060913, + "num_tokens": 91365114.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.666154384613037, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8468710780143738, + "num_tokens": 91399609.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.5129307508468628, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8582613468170166, + "num_tokens": 91442075.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.7217066287994385, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8542237281799316, + "num_tokens": 91478733.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.7284266948699951, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8411486148834229, + "num_tokens": 91514121.0, + "step": 2395 + }, + { + "epoch": 0.3047958275028622, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.5916346311569214, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8386549949645996, + "num_tokens": 91554374.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.5434794425964355, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8601114153862, + "num_tokens": 91594810.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.7241253852844238, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8561021685600281, + "num_tokens": 91630070.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 9.715557098388672e-06, + "grad_norm": 1.760489821434021, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8434390425682068, + "num_tokens": 91668405.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 1.6255183219909668, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.842041015625, + "num_tokens": 91706187.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 9.775161743164062e-06, + "grad_norm": 1.5193930864334106, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8618088960647583, + "num_tokens": 91745805.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5546332597732544, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8497970700263977, + "num_tokens": 91789835.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.7117266654968262, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8524925112724304, + "num_tokens": 91823378.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.5523030757904053, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.843419075012207, + "num_tokens": 91869542.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.456598162651062, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8733246326446533, + "num_tokens": 91913946.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5802043676376343, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8582119941711426, + "num_tokens": 91951326.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.8414252996444702, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.866030216217041, + "num_tokens": 91984907.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.7558470964431763, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8468625545501709, + "num_tokens": 92021917.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.8494077920913696, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8590095639228821, + "num_tokens": 92052458.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.7139482498168945, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8582616448402405, + "num_tokens": 92089734.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.534177303314209, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8467555046081543, + "num_tokens": 92131720.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.714464545249939, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.854057788848877, + "num_tokens": 92169072.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.4939405918121338, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8532472252845764, + "num_tokens": 92211441.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.471003532409668, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8616763949394226, + "num_tokens": 92258355.0, + "step": 2414 + }, + { + "epoch": 0.3072128227960819, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.8301059007644653, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8631897568702698, + "num_tokens": 92292524.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.8470717668533325, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8489759564399719, + "num_tokens": 92327285.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5986343622207642, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8529677391052246, + "num_tokens": 92368342.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5868083238601685, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8552060127258301, + "num_tokens": 92410957.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.9375481605529785, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8574709296226501, + "num_tokens": 92442830.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.690565586090088, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8532293438911438, + "num_tokens": 92484256.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.6081398725509644, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8615481853485107, + "num_tokens": 92524869.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.57149338722229, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8573342561721802, + "num_tokens": 92564154.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5681483745574951, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.865450382232666, + "num_tokens": 92602259.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.6724412441253662, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.855185866355896, + "num_tokens": 92640852.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.580230474472046, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.856094241142273, + "num_tokens": 92681895.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.538344144821167, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8694247603416443, + "num_tokens": 92724614.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.7842985391616821, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8476651310920715, + "num_tokens": 92760338.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5927417278289795, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.855346143245697, + "num_tokens": 92796817.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 9.834766387939453e-06, + "grad_norm": 1.5002135038375854, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8559467792510986, + "num_tokens": 92836883.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.4956120252609253, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8711735606193542, + "num_tokens": 92876037.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.545042634010315, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.873215913772583, + "num_tokens": 92911269.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.4911348819732666, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8653682470321655, + "num_tokens": 92953715.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.5958032608032227, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8557072281837463, + "num_tokens": 92994585.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.6645339727401733, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8663049340248108, + "num_tokens": 93031268.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.4614511728286743, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8742932677268982, + "num_tokens": 93072315.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.5598379373550415, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8569619655609131, + "num_tokens": 93113822.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.8105257749557495, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8775315284729004, + "num_tokens": 93146035.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.6903631687164307, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8538142442703247, + "num_tokens": 93182061.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.639884114265442, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8731957674026489, + "num_tokens": 93216223.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.6971931457519531, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8649989366531372, + "num_tokens": 93254688.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.8855499029159546, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8673382997512817, + "num_tokens": 93290928.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.6799755096435547, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8594757318496704, + "num_tokens": 93329515.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 9.894371032714844e-06, + "grad_norm": 1.583317518234253, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.86721271276474, + "num_tokens": 93370158.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.5113083124160767, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8539820909500122, + "num_tokens": 93413952.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.58826744556427, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8424316048622131, + "num_tokens": 93457801.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.6829307079315186, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8618224263191223, + "num_tokens": 93495392.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.9025218486785889, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.855043888092041, + "num_tokens": 93529454.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.8059982061386108, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.832689106464386, + "num_tokens": 93571406.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.5557255744934082, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8520078659057617, + "num_tokens": 93612717.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.7364851236343384, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8449424505233765, + "num_tokens": 93647556.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.6685365438461304, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8647283315658569, + "num_tokens": 93687620.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.6468173265457153, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8557648658752441, + "num_tokens": 93729012.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.60404634475708, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.846670389175415, + "num_tokens": 93768404.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.5291541814804077, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8573779463768005, + "num_tokens": 93809936.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.5499203205108643, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.85393226146698, + "num_tokens": 93850462.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.7023824453353882, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.848357081413269, + "num_tokens": 93886791.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.7455087900161743, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8669724464416504, + "num_tokens": 93920180.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.7866835594177246, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8520342707633972, + "num_tokens": 93954636.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.6509335041046143, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8602005243301392, + "num_tokens": 93988258.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 9.953975677490234e-06, + "grad_norm": 1.458425760269165, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8527159690856934, + "num_tokens": 94031487.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7684276103973389, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8512704372406006, + "num_tokens": 94065239.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.619899034500122, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8591831922531128, + "num_tokens": 94103510.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.7005285024642944, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8532654643058777, + "num_tokens": 94142460.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.6363627910614014, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8646090030670166, + "num_tokens": 94178955.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.6208776235580444, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8506635427474976, + "num_tokens": 94223827.0, + "step": 2465 + }, + { + "epoch": 0.3137005470041979, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.7129403352737427, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8382200002670288, + "num_tokens": 94260291.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5905741453170776, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8570321202278137, + "num_tokens": 94299106.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 1.0013580322265625e-05, + "grad_norm": 1.562027096748352, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8478699922561646, + "num_tokens": 94340015.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7045314311981201, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8481054902076721, + "num_tokens": 94376368.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6766574382781982, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8669761419296265, + "num_tokens": 94412899.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6278965473175049, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8573764562606812, + "num_tokens": 94445078.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5842700004577637, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8444525599479675, + "num_tokens": 94489193.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6720677614212036, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8630334138870239, + "num_tokens": 94528627.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7187079191207886, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8553903698921204, + "num_tokens": 94564603.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5187162160873413, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8411464691162109, + "num_tokens": 94608994.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.4825994968414307, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8655217885971069, + "num_tokens": 94650525.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6523977518081665, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8521925210952759, + "num_tokens": 94687067.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.862113118171692, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8488028645515442, + "num_tokens": 94717231.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.4929808378219604, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.862160325050354, + "num_tokens": 94764261.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6256036758422852, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8602776527404785, + "num_tokens": 94798174.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.4631654024124146, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.874113917350769, + "num_tokens": 94847234.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5371973514556885, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8501168489456177, + "num_tokens": 94889212.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5169202089309692, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8421184420585632, + "num_tokens": 94928852.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7063372135162354, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8492777347564697, + "num_tokens": 94964161.0, + "step": 2484 + }, + { + "epoch": 0.3161175422974176, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6046757698059082, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8501858711242676, + "num_tokens": 95005514.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.627776026725769, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8623011112213135, + "num_tokens": 95043801.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7302427291870117, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8657943606376648, + "num_tokens": 95078211.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6795121431350708, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8370506763458252, + "num_tokens": 95116841.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7386775016784668, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.84561687707901, + "num_tokens": 95154284.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6576781272888184, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8578829765319824, + "num_tokens": 95188919.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5729385614395142, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8550962209701538, + "num_tokens": 95231206.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.4849867820739746, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8545159101486206, + "num_tokens": 95276163.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6575556993484497, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8645889759063721, + "num_tokens": 95311024.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6290132999420166, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8653294444084167, + "num_tokens": 95346134.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6767891645431519, + "learning_rate": 1e-06, + "loss": 0.5059, + "mean_token_accuracy": 0.832648515701294, + "num_tokens": 95383106.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6924186944961548, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8440743684768677, + "num_tokens": 95419210.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.4697633981704712, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8658710718154907, + "num_tokens": 95460803.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5658559799194336, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8674554824829102, + "num_tokens": 95496599.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.6477327346801758, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8695101737976074, + "num_tokens": 95529974.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 1.953118085861206, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8482160568237305, + "num_tokens": 95567893.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.698277235031128, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8467718362808228, + "num_tokens": 95602887.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.590696930885315, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8512131571769714, + "num_tokens": 95645771.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5626875162124634, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8633027076721191, + "num_tokens": 95684035.0, + "step": 2503 + }, + { + "epoch": 0.3185345375906373, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5303750038146973, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8775274753570557, + "num_tokens": 95722755.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.7020351886749268, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8324922323226929, + "num_tokens": 95763675.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 1.0073184967041016e-05, + "grad_norm": 1.5345900058746338, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8573639988899231, + "num_tokens": 95805596.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.6045482158660889, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.843607485294342, + "num_tokens": 95845210.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.5699377059936523, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8616513609886169, + "num_tokens": 95886332.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.5688588619232178, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.839798092842102, + "num_tokens": 95925621.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.6521767377853394, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8491315841674805, + "num_tokens": 95964900.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.550369381904602, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8555434942245483, + "num_tokens": 96007582.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.6268668174743652, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8672386407852173, + "num_tokens": 96044233.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 3.681908130645752, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8716888427734375, + "num_tokens": 96079406.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.7176316976547241, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8569029569625854, + "num_tokens": 96116030.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5065311193466187, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8631814122200012, + "num_tokens": 96157106.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.6232774257659912, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8538309335708618, + "num_tokens": 96192317.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.5317587852478027, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8478578329086304, + "num_tokens": 96234846.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.9120099544525146, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8661084175109863, + "num_tokens": 96269444.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 1.7393031120300293, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8429574370384216, + "num_tokens": 96304690.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 1.6418718099594116, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8566120862960815, + "num_tokens": 96343285.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.7005157470703125, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8552122116088867, + "num_tokens": 96379300.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.5853277444839478, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8618910312652588, + "num_tokens": 96417524.0, + "step": 2522 + }, + { + "epoch": 0.320951532883857, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.4935921430587769, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.875288188457489, + "num_tokens": 96459385.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 1.446160912513733, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8733160495758057, + "num_tokens": 96501776.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 1.0192394256591797e-05, + "grad_norm": 1.5974125862121582, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8704798221588135, + "num_tokens": 96538903.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 1.0132789611816406e-05, + "grad_norm": 1.6452215909957886, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8471555709838867, + "num_tokens": 96575575.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.4745597839355469, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.869218111038208, + "num_tokens": 96619904.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.7412538528442383, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8580887317657471, + "num_tokens": 96652614.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5224580764770508, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8646477460861206, + "num_tokens": 96690743.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.535736083984375, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8415883183479309, + "num_tokens": 96730631.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5487525463104248, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8647570013999939, + "num_tokens": 96765440.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5584285259246826, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8443551063537598, + "num_tokens": 96807510.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.6562719345092773, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8452973365783691, + "num_tokens": 96845661.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5773158073425293, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8611788749694824, + "num_tokens": 96880776.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.6017791032791138, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8605241179466248, + "num_tokens": 96917632.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5509049892425537, + "learning_rate": 1e-06, + "loss": 0.5256, + "mean_token_accuracy": 0.83069908618927, + "num_tokens": 96964028.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.7190488576889038, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8489620089530945, + "num_tokens": 97001634.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.7361763715744019, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8470519781112671, + "num_tokens": 97037083.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5902024507522583, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8538203239440918, + "num_tokens": 97076052.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5491576194763184, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8664653301239014, + "num_tokens": 97111989.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.57960844039917, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8532872200012207, + "num_tokens": 97153748.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5206267833709717, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.876775860786438, + "num_tokens": 97194263.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5810364484786987, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8694359064102173, + "num_tokens": 97234517.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.5202348232269287, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8610621690750122, + "num_tokens": 97274177.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.498090386390686, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.851298451423645, + "num_tokens": 97319139.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 1.0251998901367188e-05, + "grad_norm": 1.7702250480651855, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.850685715675354, + "num_tokens": 97352621.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 1.580386996269226, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8576815128326416, + "num_tokens": 97395353.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6195801496505737, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.845721960067749, + "num_tokens": 97437064.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6108099222183228, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8507415652275085, + "num_tokens": 97478152.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 1.5223689079284668, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8438313007354736, + "num_tokens": 97520595.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 1.484908938407898, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8587905764579773, + "num_tokens": 97564904.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 1.0311603546142578e-05, + "grad_norm": 1.6115896701812744, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8570454120635986, + "num_tokens": 97604777.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.704666018486023, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.856006383895874, + "num_tokens": 97643675.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.642737865447998, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8678491115570068, + "num_tokens": 97679330.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6049517393112183, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8627566695213318, + "num_tokens": 97715937.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.7646994590759277, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8337143063545227, + "num_tokens": 97752624.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.4819841384887695, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.848945140838623, + "num_tokens": 97794914.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.5922033786773682, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.858134925365448, + "num_tokens": 97838546.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6790556907653809, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8494039177894592, + "num_tokens": 97879251.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.577496886253357, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8479762077331543, + "num_tokens": 97914659.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.580581784248352, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8619786500930786, + "num_tokens": 97954500.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6348438262939453, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.860277533531189, + "num_tokens": 97989321.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.65049409866333, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8728417158126831, + "num_tokens": 98023238.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.6765217781066895, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8410664796829224, + "num_tokens": 98061576.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 1.0371208190917969e-05, + "grad_norm": 1.703336477279663, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8713372349739075, + "num_tokens": 98094672.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.607059121131897, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8539039492607117, + "num_tokens": 98134945.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.5457229614257812, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8550934791564941, + "num_tokens": 98172916.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.5561130046844482, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8783997893333435, + "num_tokens": 98210835.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.6773645877838135, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8570352792739868, + "num_tokens": 98244127.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.555250644683838, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8552913665771484, + "num_tokens": 98286249.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.635841727256775, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8584122657775879, + "num_tokens": 98320395.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.5832037925720215, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.861183762550354, + "num_tokens": 98358920.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.649978518486023, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8533490896224976, + "num_tokens": 98392616.0, + "step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.5641677379608154, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8574549555778503, + "num_tokens": 98429547.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.6083370447158813, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8388120532035828, + "num_tokens": 98470210.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.5806547403335571, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8631805181503296, + "num_tokens": 98507375.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 1.049041748046875e-05, + "grad_norm": 1.5703176259994507, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8742099404335022, + "num_tokens": 98544177.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.6147998571395874, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8564399480819702, + "num_tokens": 98584096.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.7785661220550537, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8396589756011963, + "num_tokens": 98618507.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.729910135269165, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.847854495048523, + "num_tokens": 98654752.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 1.055002212524414e-05, + "grad_norm": 1.661676287651062, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.841778039932251, + "num_tokens": 98691761.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 3.662672519683838, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8457739353179932, + "num_tokens": 98732660.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5946346521377563, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.860572099685669, + "num_tokens": 98770609.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5420559644699097, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.857208251953125, + "num_tokens": 98810235.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5100210905075073, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8589895963668823, + "num_tokens": 98848989.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5778120756149292, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8418598771095276, + "num_tokens": 98888467.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.7052193880081177, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8498894572257996, + "num_tokens": 98924607.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.607325792312622, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8515283465385437, + "num_tokens": 98963455.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.7949049472808838, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8505387902259827, + "num_tokens": 98994900.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.546350359916687, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8741409778594971, + "num_tokens": 99032391.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.686072587966919, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8543970584869385, + "num_tokens": 99067149.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5879864692687988, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8633018136024475, + "num_tokens": 99105513.0, + "step": 2592 + }, + { + "epoch": 0.3298562523851927, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.6789706945419312, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8604533076286316, + "num_tokens": 99136925.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.6423113346099854, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8514407873153687, + "num_tokens": 99175145.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.65794837474823, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.857611894607544, + "num_tokens": 99211098.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.6489100456237793, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8603532314300537, + "num_tokens": 99247649.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.7793594598770142, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8625630736351013, + "num_tokens": 99279423.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.6572242975234985, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8538984656333923, + "num_tokens": 99317262.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.6869012117385864, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8559857606887817, + "num_tokens": 99358055.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.601169228553772, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8616990447044373, + "num_tokens": 99395998.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 1.0669231414794922e-05, + "grad_norm": 1.5323951244354248, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8709616661071777, + "num_tokens": 99434237.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.5072238445281982, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8564186096191406, + "num_tokens": 99471203.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.657065987586975, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8450921177864075, + "num_tokens": 99507275.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.785414457321167, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8605079650878906, + "num_tokens": 99538228.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.7115607261657715, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8620686531066895, + "num_tokens": 99575086.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.5879020690917969, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8721931576728821, + "num_tokens": 99616217.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.6720647811889648, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8505704998970032, + "num_tokens": 99651550.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 3.726717948913574, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8517036437988281, + "num_tokens": 99688903.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.6682013273239136, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8371264338493347, + "num_tokens": 99728120.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.561608076095581, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8626264333724976, + "num_tokens": 99766561.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.6238946914672852, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8469239473342896, + "num_tokens": 99805342.0, + "step": 2611 + }, + { + "epoch": 0.33227324767841243, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.5982921123504639, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8688662052154541, + "num_tokens": 99846589.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.5907491445541382, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8661547899246216, + "num_tokens": 99883251.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.6238547563552856, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8410196304321289, + "num_tokens": 99921471.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.8325567245483398, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8617985248565674, + "num_tokens": 99951065.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.7482973337173462, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8769035339355469, + "num_tokens": 99987392.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.6086735725402832, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8581948280334473, + "num_tokens": 100029920.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 1.0728836059570312e-05, + "grad_norm": 1.680698275566101, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8684190511703491, + "num_tokens": 100067858.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.5128813982009888, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8652380108833313, + "num_tokens": 100109579.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6102759838104248, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8587778806686401, + "num_tokens": 100146948.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6065659523010254, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8663854598999023, + "num_tokens": 100180556.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.7870619297027588, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8666654229164124, + "num_tokens": 100213094.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.5245811939239502, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8687564134597778, + "num_tokens": 100251133.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6835354566574097, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8578543663024902, + "num_tokens": 100284800.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.625450611114502, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.854719877243042, + "num_tokens": 100320477.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.7351380586624146, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8419123888015747, + "num_tokens": 100356422.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.7663191556930542, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8367912173271179, + "num_tokens": 100391131.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6015629768371582, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.855656623840332, + "num_tokens": 100429525.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6064468622207642, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8384875655174255, + "num_tokens": 100470035.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.661985158920288, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8566441535949707, + "num_tokens": 100506096.0, + "step": 2630 + }, + { + "epoch": 0.33469024297163213, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.5640724897384644, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8653632402420044, + "num_tokens": 100548661.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.6421446800231934, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8399618864059448, + "num_tokens": 100586456.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 1.0788440704345703e-05, + "grad_norm": 1.586815595626831, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8707253932952881, + "num_tokens": 100623743.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6976211071014404, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8535369634628296, + "num_tokens": 100661097.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5932190418243408, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8687038421630859, + "num_tokens": 100698956.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5666706562042236, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8695566654205322, + "num_tokens": 100736831.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.627058506011963, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8477281332015991, + "num_tokens": 100773884.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5971988439559937, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8629975318908691, + "num_tokens": 100814181.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6978744268417358, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8499642610549927, + "num_tokens": 100853983.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.56113600730896, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8680188655853271, + "num_tokens": 100890179.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.7987380027770996, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8515472412109375, + "num_tokens": 100924924.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.7287672758102417, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8504325151443481, + "num_tokens": 100962534.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5544782876968384, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8592077493667603, + "num_tokens": 101002488.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.4291694164276123, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8594194054603577, + "num_tokens": 101048781.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5637569427490234, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8669536709785461, + "num_tokens": 101087836.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6115381717681885, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8672273755073547, + "num_tokens": 101122625.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.7400153875350952, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8644009828567505, + "num_tokens": 101153167.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.534529209136963, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8673847913742065, + "num_tokens": 101189568.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5429507493972778, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8579434752464294, + "num_tokens": 101228636.0, + "step": 2649 + }, + { + "epoch": 0.3371072382648518, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5311084985733032, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8661791086196899, + "num_tokens": 101269547.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5029380321502686, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8640121221542358, + "num_tokens": 101310685.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6899995803833008, + "learning_rate": 1e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8259786367416382, + "num_tokens": 101351026.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6431341171264648, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8539798259735107, + "num_tokens": 101388344.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.657787799835205, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8624196648597717, + "num_tokens": 101425494.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.540266513824463, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8649601936340332, + "num_tokens": 101466328.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.6067185401916504, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8609682321548462, + "num_tokens": 101508209.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.8038047552108765, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8429677486419678, + "num_tokens": 101541472.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.5294495820999146, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8426637649536133, + "num_tokens": 101587582.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.731290578842163, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.840255856513977, + "num_tokens": 101624173.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 1.0848045349121094e-05, + "grad_norm": 1.7298636436462402, + "learning_rate": 1e-06, + "loss": 0.5339, + "mean_token_accuracy": 0.8286223411560059, + "num_tokens": 101658997.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.8148407936096191, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.858008623123169, + "num_tokens": 101690880.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.4269585609436035, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8701947927474976, + "num_tokens": 101732789.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.5923075675964355, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8519221544265747, + "num_tokens": 101775000.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.5844182968139648, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8485345840454102, + "num_tokens": 101813386.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 1.0967254638671875e-05, + "grad_norm": 1.6631817817687988, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8503468036651611, + "num_tokens": 101846390.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 1.5748724937438965, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8557712435722351, + "num_tokens": 101885307.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 1.1026859283447266e-05, + "grad_norm": 1.714999794960022, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8428927659988403, + "num_tokens": 101923319.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6528748273849487, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8488426208496094, + "num_tokens": 101959063.0, + "step": 2668 + }, + { + "epoch": 0.3395242335580715, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5597443580627441, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8801937699317932, + "num_tokens": 101996900.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6946080923080444, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8441461324691772, + "num_tokens": 102034863.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.481988549232483, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8515709638595581, + "num_tokens": 102075527.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6753647327423096, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8395024538040161, + "num_tokens": 102114926.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.4826921224594116, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8511115312576294, + "num_tokens": 102158362.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.499377727508545, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8585660457611084, + "num_tokens": 102201105.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6239618062973022, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8767497539520264, + "num_tokens": 102233312.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.7988582849502563, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.830870509147644, + "num_tokens": 102267342.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.4485411643981934, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8741400241851807, + "num_tokens": 102304590.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5351482629776, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.864978551864624, + "num_tokens": 102340882.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5942728519439697, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8655205965042114, + "num_tokens": 102376445.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6497461795806885, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.85860276222229, + "num_tokens": 102413706.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.606969952583313, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8490742444992065, + "num_tokens": 102452843.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.7106237411499023, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8392388224601746, + "num_tokens": 102489094.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.729838490486145, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8477334976196289, + "num_tokens": 102525873.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.8284525871276855, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8528552651405334, + "num_tokens": 102560390.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.4983919858932495, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8600605130195618, + "num_tokens": 102604455.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.7051746845245361, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8663901090621948, + "num_tokens": 102641112.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6470839977264404, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8562352657318115, + "num_tokens": 102679628.0, + "step": 2687 + }, + { + "epoch": 0.3419412288512912, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6669983863830566, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8612763285636902, + "num_tokens": 102714385.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6915067434310913, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8698297739028931, + "num_tokens": 102751095.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.655712366104126, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8628692626953125, + "num_tokens": 102784904.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.3943852186203003, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8579068779945374, + "num_tokens": 102835232.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.718949794769287, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8563913106918335, + "num_tokens": 102872663.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.8180899620056152, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8519870042800903, + "num_tokens": 102908056.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.80461585521698, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8701529502868652, + "num_tokens": 102944745.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6313817501068115, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8672782182693481, + "num_tokens": 102985200.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.512457251548767, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8692213296890259, + "num_tokens": 103029738.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.874166488647461, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8407604694366455, + "num_tokens": 103065830.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6579865217208862, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.855570375919342, + "num_tokens": 103111778.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5362154245376587, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.862994372844696, + "num_tokens": 103155477.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.676194190979004, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8679343461990356, + "num_tokens": 103194670.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.676590919494629, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8673889636993408, + "num_tokens": 103230914.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6638281345367432, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8586427569389343, + "num_tokens": 103267826.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.7019007205963135, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8550893068313599, + "num_tokens": 103301979.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6120294332504272, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.835332453250885, + "num_tokens": 103344950.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.685958981513977, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8593928217887878, + "num_tokens": 103382356.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5594710111618042, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8656413555145264, + "num_tokens": 103421393.0, + "step": 2706 + }, + { + "epoch": 0.3443582241445109, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5784279108047485, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8635567426681519, + "num_tokens": 103460109.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.633878469467163, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8656058311462402, + "num_tokens": 103496715.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.585506796836853, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8480408787727356, + "num_tokens": 103539804.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.5951205492019653, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8515252470970154, + "num_tokens": 103583945.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6577954292297363, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8630272150039673, + "num_tokens": 103622391.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.706871509552002, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8386673331260681, + "num_tokens": 103658889.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6087967157363892, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8717759847640991, + "num_tokens": 103697035.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.702986478805542, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8516055345535278, + "num_tokens": 103732261.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6677913665771484, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8613897562026978, + "num_tokens": 103771266.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.606870174407959, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8590020537376404, + "num_tokens": 103810735.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.579150676727295, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8478326797485352, + "num_tokens": 103850194.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6314961910247803, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.867060124874115, + "num_tokens": 103885770.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.57319176197052, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8528069853782654, + "num_tokens": 103927360.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6036100387573242, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8619632124900818, + "num_tokens": 103967618.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.688081979751587, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8406107425689697, + "num_tokens": 104012242.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.9610265493392944, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8426102995872498, + "num_tokens": 104043232.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.5711662769317627, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8596237897872925, + "num_tokens": 104084037.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.4928479194641113, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8633465766906738, + "num_tokens": 104126213.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.6004129648208618, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8545626997947693, + "num_tokens": 104169084.0, + "step": 2725 + }, + { + "epoch": 0.34677521943773054, + "ewc_loss": 1.1146068572998047e-05, + "grad_norm": 1.590063452720642, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8591623306274414, + "num_tokens": 104206212.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.6825271844863892, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8640361428260803, + "num_tokens": 104246180.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.4998867511749268, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8571522831916809, + "num_tokens": 104285627.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.6716763973236084, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8712137937545776, + "num_tokens": 104319531.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.595076084136963, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8629128932952881, + "num_tokens": 104357636.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.5781642198562622, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.861133337020874, + "num_tokens": 104397106.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.622559905052185, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8324847221374512, + "num_tokens": 104436758.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.5986719131469727, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8627402186393738, + "num_tokens": 104473475.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.784120798110962, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8451063632965088, + "num_tokens": 104509068.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.7768715620040894, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8409344553947449, + "num_tokens": 104545766.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 1.1205673217773438e-05, + "grad_norm": 1.7749778032302856, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8506237268447876, + "num_tokens": 104586403.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 1.5383803844451904, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8428893089294434, + "num_tokens": 104629630.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 1.5454891920089722, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8569157123565674, + "num_tokens": 104671113.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 1.1265277862548828e-05, + "grad_norm": 1.6910597085952759, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8536943793296814, + "num_tokens": 104706287.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5761213302612305, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8543674945831299, + "num_tokens": 104748980.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.6820698976516724, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8663352131843567, + "num_tokens": 104786718.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.8811099529266357, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8419821262359619, + "num_tokens": 104818867.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.8645453453063965, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8471356630325317, + "num_tokens": 104850782.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.711007833480835, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8373774886131287, + "num_tokens": 104887394.0, + "step": 2744 + }, + { + "epoch": 0.34919221473095025, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5923947095870972, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8632057905197144, + "num_tokens": 104923651.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.683345079421997, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8476240634918213, + "num_tokens": 104960860.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.762892246246338, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8581803441047668, + "num_tokens": 104996666.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.449921727180481, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8679909706115723, + "num_tokens": 105039061.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.7714978456497192, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8431992530822754, + "num_tokens": 105077125.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.6564916372299194, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8558216691017151, + "num_tokens": 105111236.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.7122833728790283, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8541890382766724, + "num_tokens": 105147763.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.6066805124282837, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8542019128799438, + "num_tokens": 105188699.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.457629680633545, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8736688494682312, + "num_tokens": 105229847.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5360825061798096, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.856603741645813, + "num_tokens": 105269286.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.6669034957885742, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8608264923095703, + "num_tokens": 105308414.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.6003599166870117, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8529109954833984, + "num_tokens": 105347296.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.4669029712677002, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8701626062393188, + "num_tokens": 105388701.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.7037352323532104, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8534877300262451, + "num_tokens": 105421865.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.7062804698944092, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8562791347503662, + "num_tokens": 105457653.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5685979127883911, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8569068312644958, + "num_tokens": 105497925.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.761386513710022, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8631463050842285, + "num_tokens": 105533685.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.8361961841583252, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8492343425750732, + "num_tokens": 105565036.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5815128087997437, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8484145402908325, + "num_tokens": 105609815.0, + "step": 2763 + }, + { + "epoch": 0.35160921002416995, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.7719979286193848, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8411358594894409, + "num_tokens": 105643748.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5681803226470947, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8692324161529541, + "num_tokens": 105684607.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 1.1324882507324219e-05, + "grad_norm": 1.5850732326507568, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8440374135971069, + "num_tokens": 105723834.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5598224401474, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8554595708847046, + "num_tokens": 105765592.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.4918811321258545, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8619639873504639, + "num_tokens": 105808292.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.4601577520370483, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8622081279754639, + "num_tokens": 105847842.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5834171772003174, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8631622791290283, + "num_tokens": 105890302.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6042566299438477, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8529379367828369, + "num_tokens": 105924581.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5519863367080688, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.875952959060669, + "num_tokens": 105962598.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.4465978145599365, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8761329650878906, + "num_tokens": 106002473.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.492030382156372, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.871109127998352, + "num_tokens": 106039400.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6879656314849854, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8711119294166565, + "num_tokens": 106074550.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5341142416000366, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8682348728179932, + "num_tokens": 106114028.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5415258407592773, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8683152198791504, + "num_tokens": 106153182.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5608094930648804, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8820476531982422, + "num_tokens": 106190542.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5637855529785156, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8567498922348022, + "num_tokens": 106230218.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.627015233039856, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8567897081375122, + "num_tokens": 106268898.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7065051794052124, + "learning_rate": 1e-06, + "loss": 0.5191, + "mean_token_accuracy": 0.8373187780380249, + "num_tokens": 106309125.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.313018321990967, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8489725589752197, + "num_tokens": 106343873.0, + "step": 2782 + }, + { + "epoch": 0.35402620531738965, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.72542142868042, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8467857837677002, + "num_tokens": 106378255.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.675966501235962, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8640187978744507, + "num_tokens": 106415583.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6488226652145386, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8595412969589233, + "num_tokens": 106452533.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6501984596252441, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8604554533958435, + "num_tokens": 106489628.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5842067003250122, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8634033799171448, + "num_tokens": 106532257.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6869837045669556, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.854729175567627, + "num_tokens": 106569445.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5233627557754517, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8569433689117432, + "num_tokens": 106609980.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.8184468746185303, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8527705669403076, + "num_tokens": 106655390.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6173397302627563, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8547401428222656, + "num_tokens": 106692711.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5331549644470215, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8637865781784058, + "num_tokens": 106733118.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5251461267471313, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8761513829231262, + "num_tokens": 106769806.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5297391414642334, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8419594764709473, + "num_tokens": 106815314.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 1.138448715209961e-05, + "grad_norm": 1.5850589275360107, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8567368984222412, + "num_tokens": 106853363.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 1.138448715209961e-05, + "grad_norm": 1.571818232536316, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8765832185745239, + "num_tokens": 106888676.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 1.138448715209961e-05, + "grad_norm": 1.5274620056152344, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8543581366539001, + "num_tokens": 106931761.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5115482807159424, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.849745512008667, + "num_tokens": 106973178.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6141538619995117, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.856386661529541, + "num_tokens": 107010158.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6641623973846436, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8511053323745728, + "num_tokens": 107048337.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5476080179214478, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8503372669219971, + "num_tokens": 107088004.0, + "step": 2801 + }, + { + "epoch": 0.35644320061060936, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.561998963356018, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8768407106399536, + "num_tokens": 107124993.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 2.2392218112945557, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8468097448348999, + "num_tokens": 107168447.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.542615294456482, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8586716055870056, + "num_tokens": 107212124.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6790140867233276, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8520189523696899, + "num_tokens": 107247945.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7315731048583984, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8434870839118958, + "num_tokens": 107280945.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5516246557235718, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8596708178520203, + "num_tokens": 107320318.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5317860841751099, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8601512908935547, + "num_tokens": 107361961.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6243743896484375, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8680658936500549, + "num_tokens": 107396584.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7881611585617065, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8635557889938354, + "num_tokens": 107432418.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6100064516067505, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8596990704536438, + "num_tokens": 107473717.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6188331842422485, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8519687652587891, + "num_tokens": 107513187.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.66750967502594, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8285186886787415, + "num_tokens": 107554290.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5938012599945068, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.854378342628479, + "num_tokens": 107594771.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6176304817199707, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.869234561920166, + "num_tokens": 107630818.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7585245370864868, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8640552163124084, + "num_tokens": 107666318.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7141468524932861, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.871830940246582, + "num_tokens": 107701996.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.7479650974273682, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8533633947372437, + "num_tokens": 107733551.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.669942855834961, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8544051051139832, + "num_tokens": 107770653.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5248409509658813, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8476070165634155, + "num_tokens": 107816470.0, + "step": 2820 + }, + { + "epoch": 0.358860195903829, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5749483108520508, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8660115599632263, + "num_tokens": 107849856.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.605698585510254, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8552014231681824, + "num_tokens": 107886898.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.8599821329116821, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8414216041564941, + "num_tokens": 107920257.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6184738874435425, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.856082022190094, + "num_tokens": 107955085.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5587704181671143, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8613157272338867, + "num_tokens": 107993576.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6858075857162476, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8507739305496216, + "num_tokens": 108031661.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5502989292144775, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.864719808101654, + "num_tokens": 108067701.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6767258644104004, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.841270923614502, + "num_tokens": 108106880.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.424550175666809, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8677916526794434, + "num_tokens": 108150777.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.5132747888565063, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8598034381866455, + "num_tokens": 108192240.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6117475032806396, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8667593002319336, + "num_tokens": 108229716.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 1.1444091796875e-05, + "grad_norm": 1.6346700191497803, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8530123829841614, + "num_tokens": 108269599.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.7915209531784058, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8632872104644775, + "num_tokens": 108303799.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6738733053207397, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8589924573898315, + "num_tokens": 108341003.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.453248381614685, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8653432130813599, + "num_tokens": 108385567.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.4901005029678345, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8722388744354248, + "num_tokens": 108424847.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.7904430627822876, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8595455884933472, + "num_tokens": 108456489.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.598123550415039, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8710988759994507, + "num_tokens": 108492777.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.699326753616333, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8440978527069092, + "num_tokens": 108529570.0, + "step": 2839 + }, + { + "epoch": 0.3612771911970487, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.7039364576339722, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.86351078748703, + "num_tokens": 108565236.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5997592210769653, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8639256954193115, + "num_tokens": 108608319.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.598341941833496, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8573110103607178, + "num_tokens": 108650087.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6256992816925049, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8461976647377014, + "num_tokens": 108688438.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5570133924484253, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8504853248596191, + "num_tokens": 108728771.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.7686673402786255, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8457547426223755, + "num_tokens": 108765038.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.655868411064148, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8639233112335205, + "num_tokens": 108804247.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6617403030395508, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8498091101646423, + "num_tokens": 108839684.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.646837830543518, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8669406175613403, + "num_tokens": 108880264.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.710190773010254, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8612176179885864, + "num_tokens": 108918034.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6745446920394897, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8600167632102966, + "num_tokens": 108954616.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5703176259994507, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8574837446212769, + "num_tokens": 108993060.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5072818994522095, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.871342122554779, + "num_tokens": 109033050.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5987722873687744, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8539720773696899, + "num_tokens": 109075261.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5706793069839478, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8723202347755432, + "num_tokens": 109117287.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5768909454345703, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.851276159286499, + "num_tokens": 109159274.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.5325878858566284, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8618984818458557, + "num_tokens": 109200959.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.4399968385696411, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8690702319145203, + "num_tokens": 109247891.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6850848197937012, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8428342938423157, + "num_tokens": 109283209.0, + "step": 2858 + }, + { + "epoch": 0.3636941864902684, + "ewc_loss": 1.150369644165039e-05, + "grad_norm": 1.6941486597061157, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8348989486694336, + "num_tokens": 109321495.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.793169379234314, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8581212759017944, + "num_tokens": 109359125.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6386046409606934, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8676329851150513, + "num_tokens": 109395471.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5972621440887451, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8463347554206848, + "num_tokens": 109435031.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5253440141677856, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8653371334075928, + "num_tokens": 109472633.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6902241706848145, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8500019311904907, + "num_tokens": 109508069.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5004069805145264, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8688378930091858, + "num_tokens": 109547495.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.660446286201477, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8484437465667725, + "num_tokens": 109585442.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.732774257659912, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8769493103027344, + "num_tokens": 109616008.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6443392038345337, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8595478534698486, + "num_tokens": 109659592.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5893659591674805, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8598964214324951, + "num_tokens": 109698467.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.919001579284668, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8623803853988647, + "num_tokens": 109727439.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6392343044281006, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8507683873176575, + "num_tokens": 109766157.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.574271321296692, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8646057844161987, + "num_tokens": 109804397.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.7303800582885742, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8592419028282166, + "num_tokens": 109841216.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6228197813034058, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8551409244537354, + "num_tokens": 109878306.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6012815237045288, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.837648332118988, + "num_tokens": 109923359.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5376338958740234, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8567136526107788, + "num_tokens": 109961762.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6804518699645996, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.853608250617981, + "num_tokens": 109997890.0, + "step": 2877 + }, + { + "epoch": 0.3661111817834881, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.597174048423767, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8600439429283142, + "num_tokens": 110036344.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5254912376403809, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8646728992462158, + "num_tokens": 110075590.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6661596298217773, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8681122660636902, + "num_tokens": 110109146.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5847105979919434, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8578057289123535, + "num_tokens": 110148590.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.46531343460083, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8618757724761963, + "num_tokens": 110192904.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.687329649925232, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8626635074615479, + "num_tokens": 110226189.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.4380035400390625, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8747493028640747, + "num_tokens": 110268297.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.699862003326416, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8582183718681335, + "num_tokens": 110305345.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6497421264648438, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8360482454299927, + "num_tokens": 110346149.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.671096920967102, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8630285263061523, + "num_tokens": 110386654.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.5646263360977173, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8550882935523987, + "num_tokens": 110426521.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.541944146156311, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8646037578582764, + "num_tokens": 110466147.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.6312216520309448, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8579431772232056, + "num_tokens": 110508557.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.472747564315796, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.860161542892456, + "num_tokens": 110553067.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.541567087173462, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8366882801055908, + "num_tokens": 110599503.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 1.1563301086425781e-05, + "grad_norm": 1.6356823444366455, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8419094681739807, + "num_tokens": 110637099.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.8154462575912476, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8515909910202026, + "num_tokens": 110667054.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.5689830780029297, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8676267862319946, + "num_tokens": 110703755.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.601503849029541, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8594115376472473, + "num_tokens": 110741417.0, + "step": 2896 + }, + { + "epoch": 0.3685281770767078, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.7340142726898193, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8497593402862549, + "num_tokens": 110776269.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.6405467987060547, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8416203856468201, + "num_tokens": 110819740.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.6463382244110107, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8554465770721436, + "num_tokens": 110856989.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.5787944793701172, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8680091500282288, + "num_tokens": 110894934.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.5409343242645264, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8666346073150635, + "num_tokens": 110933728.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.5251953601837158, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8534538149833679, + "num_tokens": 110972435.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.477316975593567, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8642843961715698, + "num_tokens": 111012241.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.5501986742019653, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8597924113273621, + "num_tokens": 111052140.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.7438702583312988, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8631724119186401, + "num_tokens": 111083654.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 1.1622905731201172e-05, + "grad_norm": 1.6709436178207397, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.842970609664917, + "num_tokens": 111122573.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6460574865341187, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.872406005859375, + "num_tokens": 111157230.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6702094078063965, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8522099852561951, + "num_tokens": 111192806.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6174818277359009, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8634251356124878, + "num_tokens": 111231074.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.7163076400756836, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8477602005004883, + "num_tokens": 111267329.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6216216087341309, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8595733642578125, + "num_tokens": 111307285.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.7736239433288574, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8392282128334045, + "num_tokens": 111344141.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.5376344919204712, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8564255237579346, + "num_tokens": 111384965.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.698695182800293, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.859674870967865, + "num_tokens": 111418620.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.5842536687850952, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8759504556655884, + "num_tokens": 111455131.0, + "step": 2915 + }, + { + "epoch": 0.37094517236992747, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.534814715385437, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8612128496170044, + "num_tokens": 111495832.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.5017125606536865, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8477784395217896, + "num_tokens": 111540416.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6938260793685913, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8564539551734924, + "num_tokens": 111575403.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6093083620071411, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8598854541778564, + "num_tokens": 111613327.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.5959841012954712, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8581525087356567, + "num_tokens": 111652136.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.6088392734527588, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8711643218994141, + "num_tokens": 111686115.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 1.1682510375976562e-05, + "grad_norm": 1.5626088380813599, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8667867183685303, + "num_tokens": 111724982.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6292837858200073, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8571500778198242, + "num_tokens": 111762551.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6045771837234497, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.86260586977005, + "num_tokens": 111804041.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6581331491470337, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8576573133468628, + "num_tokens": 111843379.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6521841287612915, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8722126483917236, + "num_tokens": 111878430.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5086119174957275, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8593066930770874, + "num_tokens": 111919274.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6502456665039062, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8635593056678772, + "num_tokens": 111954865.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7253830432891846, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8651888370513916, + "num_tokens": 111988802.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.54213285446167, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8572845458984375, + "num_tokens": 112028268.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5671653747558594, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8599337339401245, + "num_tokens": 112065951.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5884249210357666, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8536073565483093, + "num_tokens": 112103390.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5773402452468872, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8710052371025085, + "num_tokens": 112139518.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6030184030532837, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8737753629684448, + "num_tokens": 112173799.0, + "step": 2934 + }, + { + "epoch": 0.37336216766314717, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.492437720298767, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.865552544593811, + "num_tokens": 112214464.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.479756236076355, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8612920045852661, + "num_tokens": 112259543.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.50043785572052, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8636002540588379, + "num_tokens": 112301321.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.650056004524231, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8449342250823975, + "num_tokens": 112340928.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7742218971252441, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8604801297187805, + "num_tokens": 112375643.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7342746257781982, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8533283472061157, + "num_tokens": 112413155.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.585223913192749, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8584689497947693, + "num_tokens": 112452962.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6429667472839355, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8579058647155762, + "num_tokens": 112490938.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7532674074172974, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8374799489974976, + "num_tokens": 112529045.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5833371877670288, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8586024045944214, + "num_tokens": 112569318.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6723947525024414, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8667572140693665, + "num_tokens": 112603009.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.650356411933899, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8561679720878601, + "num_tokens": 112639464.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5317094326019287, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8679823279380798, + "num_tokens": 112680877.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7014529705047607, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8654084801673889, + "num_tokens": 112712503.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6999439001083374, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.862669050693512, + "num_tokens": 112746302.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5238369703292847, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.858087420463562, + "num_tokens": 112790083.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5562593936920166, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8726550340652466, + "num_tokens": 112826388.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5718785524368286, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8536190390586853, + "num_tokens": 112867245.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5259246826171875, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8567038178443909, + "num_tokens": 112913164.0, + "step": 2953 + }, + { + "epoch": 0.3757791629563669, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.4944405555725098, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8810721635818481, + "num_tokens": 112950572.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6540290117263794, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8620957136154175, + "num_tokens": 112984883.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.5841008424758911, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8644270896911621, + "num_tokens": 113022995.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6551467180252075, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8587190508842468, + "num_tokens": 113063840.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.7506544589996338, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8474912047386169, + "num_tokens": 113102358.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.8027678728103638, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8447717428207397, + "num_tokens": 113142117.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.6524182558059692, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8504254221916199, + "num_tokens": 113181143.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.738293170928955, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8541364669799805, + "num_tokens": 113215344.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 1.1801719665527344e-05, + "grad_norm": 1.537481665611267, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8528329133987427, + "num_tokens": 113261471.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7282836437225342, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8615322113037109, + "num_tokens": 113300517.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6320356130599976, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8533206582069397, + "num_tokens": 113340915.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5745437145233154, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8571054935455322, + "num_tokens": 113384308.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.797778844833374, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8393017053604126, + "num_tokens": 113421059.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7751266956329346, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8650254011154175, + "num_tokens": 113458024.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5363701581954956, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8762186765670776, + "num_tokens": 113495761.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5817701816558838, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8564566373825073, + "num_tokens": 113538372.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.829105257987976, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8676165342330933, + "num_tokens": 113572100.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.8420135974884033, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8455191850662231, + "num_tokens": 113606244.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6231582164764404, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8699254393577576, + "num_tokens": 113642608.0, + "step": 2972 + }, + { + "epoch": 0.3781961582495866, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.679287075996399, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.85462486743927, + "num_tokens": 113678479.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7480219602584839, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8328682780265808, + "num_tokens": 113711836.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6691992282867432, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8422343730926514, + "num_tokens": 113748233.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6192824840545654, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8515067100524902, + "num_tokens": 113787735.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5679293870925903, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8645250797271729, + "num_tokens": 113827518.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5492560863494873, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8663238286972046, + "num_tokens": 113869379.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6400995254516602, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8576832413673401, + "num_tokens": 113904586.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5185950994491577, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.878759503364563, + "num_tokens": 113944512.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.8661490678787231, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8633086681365967, + "num_tokens": 113979674.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.81526780128479, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8685543537139893, + "num_tokens": 114011936.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6574040651321411, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8544001579284668, + "num_tokens": 114045553.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7739259004592896, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8500620722770691, + "num_tokens": 114077529.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6218760013580322, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8668121695518494, + "num_tokens": 114116190.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6172126531600952, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8541611433029175, + "num_tokens": 114154835.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7037094831466675, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8466060757637024, + "num_tokens": 114194161.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.903734803199768, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8467596769332886, + "num_tokens": 114222639.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.4539034366607666, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8638200759887695, + "num_tokens": 114268010.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5514979362487793, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8482444882392883, + "num_tokens": 114308326.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6239639520645142, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8620585203170776, + "num_tokens": 114343653.0, + "step": 2991 + }, + { + "epoch": 0.3806131535428063, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.7182481288909912, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8551696538925171, + "num_tokens": 114378907.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.546751618385315, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8558571934700012, + "num_tokens": 114415185.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.750475287437439, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8303679823875427, + "num_tokens": 114452469.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.4770928621292114, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8605303764343262, + "num_tokens": 114493710.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.5487264394760132, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8599981665611267, + "num_tokens": 114535619.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.4725327491760254, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8638778924942017, + "num_tokens": 114578863.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.4759910106658936, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8685712814331055, + "num_tokens": 114617800.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.6518628597259521, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8417110443115234, + "num_tokens": 114659106.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6227082014083862, + "learning_rate": 1e-06, + "loss": 0.5407, + "mean_token_accuracy": 0.8217678070068359, + "num_tokens": 114700159.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6813780069351196, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8672299385070801, + "num_tokens": 114732181.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.600371241569519, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8688083291053772, + "num_tokens": 114764967.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6610023975372314, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8592209815979004, + "num_tokens": 114805730.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 1.1861324310302734e-05, + "grad_norm": 1.6027222871780396, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8641344308853149, + "num_tokens": 114845554.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.6402472257614136, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8609554171562195, + "num_tokens": 114880824.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.7241021394729614, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8630313277244568, + "num_tokens": 114913054.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 1.539739727973938, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.863571047782898, + "num_tokens": 114954629.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 1.6813480854034424, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8465312719345093, + "num_tokens": 114991012.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.5583778619766235, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8651965260505676, + "num_tokens": 115030145.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 1.6421353816986084, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8561444282531738, + "num_tokens": 115071862.0, + "step": 3010 + }, + { + "epoch": 0.38303014883602593, + "ewc_loss": 1.1980533599853516e-05, + "grad_norm": 1.4283555746078491, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8840054273605347, + "num_tokens": 115110722.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 1.1920928955078125e-05, + "grad_norm": 1.5294109582901, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8475247025489807, + "num_tokens": 115156607.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.7108697891235352, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8563275933265686, + "num_tokens": 115191190.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5253264904022217, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8556960821151733, + "num_tokens": 115238043.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.508434772491455, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8703611493110657, + "num_tokens": 115277815.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.567386507987976, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8614373803138733, + "num_tokens": 115317506.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.8279485702514648, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8554672002792358, + "num_tokens": 115347468.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5614866018295288, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.868163526058197, + "num_tokens": 115383281.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.4612782001495361, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8617879748344421, + "num_tokens": 115426374.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.3932000398635864, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8696430325508118, + "num_tokens": 115469091.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.555779218673706, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8713929653167725, + "num_tokens": 115506823.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5022331476211548, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8719867467880249, + "num_tokens": 115542364.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5249099731445312, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8563567399978638, + "num_tokens": 115580880.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5435001850128174, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8725374341011047, + "num_tokens": 115619347.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5573484897613525, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8600514531135559, + "num_tokens": 115659968.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5427844524383545, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8579446077346802, + "num_tokens": 115698713.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5644493103027344, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8608973026275635, + "num_tokens": 115738322.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.6239254474639893, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8454791307449341, + "num_tokens": 115777119.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5838207006454468, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8746089935302734, + "num_tokens": 115812648.0, + "step": 3029 + }, + { + "epoch": 0.38544714412924563, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.7505677938461304, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8308303952217102, + "num_tokens": 115850687.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5057734251022339, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8616576194763184, + "num_tokens": 115892078.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5150798559188843, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.874748706817627, + "num_tokens": 115931216.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.4843684434890747, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8526472449302673, + "num_tokens": 115974078.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5996263027191162, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8618693351745605, + "num_tokens": 116010530.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.6708927154541016, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8412477374076843, + "num_tokens": 116048887.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5008165836334229, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8694546818733215, + "num_tokens": 116090757.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5808085203170776, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8692494034767151, + "num_tokens": 116128090.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.5588527917861938, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8479127287864685, + "num_tokens": 116169362.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.8094041347503662, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.850357174873352, + "num_tokens": 116200776.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 1.2040138244628906e-05, + "grad_norm": 1.6733365058898926, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.866834282875061, + "num_tokens": 116236273.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 1.2099742889404297e-05, + "grad_norm": 1.7410342693328857, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8515981435775757, + "num_tokens": 116268942.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 1.2099742889404297e-05, + "grad_norm": 1.7592145204544067, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8462039232254028, + "num_tokens": 116304904.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 1.2099742889404297e-05, + "grad_norm": 1.6262118816375732, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8640079498291016, + "num_tokens": 116341723.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.7158950567245483, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8552687168121338, + "num_tokens": 116377371.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.6282657384872437, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8524267673492432, + "num_tokens": 116418901.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.6442643404006958, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.856735348701477, + "num_tokens": 116456292.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.663185715675354, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8542013168334961, + "num_tokens": 116492683.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.6609553098678589, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8586665987968445, + "num_tokens": 116526672.0, + "step": 3048 + }, + { + "epoch": 0.38786413942246534, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.7597410678863525, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8408184051513672, + "num_tokens": 116559357.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.6380219459533691, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8448752164840698, + "num_tokens": 116598605.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.8193738460540771, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8471670150756836, + "num_tokens": 116634218.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.4890966415405273, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8594185709953308, + "num_tokens": 116675591.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 1.2159347534179688e-05, + "grad_norm": 1.594942331314087, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8588962554931641, + "num_tokens": 116711390.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.7549784183502197, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.867433488368988, + "num_tokens": 116744271.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6059303283691406, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8605586290359497, + "num_tokens": 116782318.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6629488468170166, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.838519275188446, + "num_tokens": 116823960.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.677733063697815, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8597427010536194, + "num_tokens": 116858676.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.64602530002594, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8617287278175354, + "num_tokens": 116893197.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5079518556594849, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8727089166641235, + "num_tokens": 116935849.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6189523935317993, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8574458956718445, + "num_tokens": 116974097.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6432808637619019, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8546403050422668, + "num_tokens": 117010674.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.581770896911621, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.870192289352417, + "num_tokens": 117045695.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5626636743545532, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8687067031860352, + "num_tokens": 117081893.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5324698686599731, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8635950088500977, + "num_tokens": 117122593.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.59036123752594, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8697769641876221, + "num_tokens": 117156211.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.4936500787734985, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8600214719772339, + "num_tokens": 117198110.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5712032318115234, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8717660307884216, + "num_tokens": 117234728.0, + "step": 3067 + }, + { + "epoch": 0.39028113471568504, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6249492168426514, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8576999306678772, + "num_tokens": 117273883.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.7870218753814697, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8416420221328735, + "num_tokens": 117308738.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6928913593292236, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8651912212371826, + "num_tokens": 117345058.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6030709743499756, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8501232862472534, + "num_tokens": 117383980.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.731155514717102, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.857528805732727, + "num_tokens": 117422006.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5510749816894531, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8551880121231079, + "num_tokens": 117460072.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.655688762664795, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8590147495269775, + "num_tokens": 117495638.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.7157858610153198, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8321337699890137, + "num_tokens": 117531493.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5968983173370361, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8498972654342651, + "num_tokens": 117572790.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5372302532196045, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8585095405578613, + "num_tokens": 117613136.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.543636441230774, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8605917692184448, + "num_tokens": 117655490.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.7560911178588867, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8469377756118774, + "num_tokens": 117690183.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.535943865776062, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8590020537376404, + "num_tokens": 117725440.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.575203537940979, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8525415658950806, + "num_tokens": 117763849.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6877624988555908, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8402932286262512, + "num_tokens": 117804695.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6369109153747559, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.846386194229126, + "num_tokens": 117845866.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6103384494781494, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8482791185379028, + "num_tokens": 117885528.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6308295726776123, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8462069034576416, + "num_tokens": 117924377.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6763187646865845, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8645051717758179, + "num_tokens": 117959378.0, + "step": 3086 + }, + { + "epoch": 0.39269813000890474, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.497177243232727, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8507023453712463, + "num_tokens": 118003331.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.632411003112793, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8552196025848389, + "num_tokens": 118044534.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.722872257232666, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8596393465995789, + "num_tokens": 118080513.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5346444845199585, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8509272336959839, + "num_tokens": 118123042.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5693031549453735, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.860336422920227, + "num_tokens": 118162262.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.4139961004257202, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8569315671920776, + "num_tokens": 118208661.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.590601921081543, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8494765162467957, + "num_tokens": 118250858.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5517723560333252, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.880388081073761, + "num_tokens": 118285163.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5370324850082397, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.861038088798523, + "num_tokens": 118323568.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6221047639846802, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8666795492172241, + "num_tokens": 118358835.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5165984630584717, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8627271056175232, + "num_tokens": 118397872.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.547158122062683, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8662346601486206, + "num_tokens": 118437805.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5777961015701294, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.859899640083313, + "num_tokens": 118478478.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.578958511352539, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8551019430160522, + "num_tokens": 118517688.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5894801616668701, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8483878374099731, + "num_tokens": 118560055.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5951145887374878, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8584437370300293, + "num_tokens": 118604703.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.8474220037460327, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8531041741371155, + "num_tokens": 118638347.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.6546512842178345, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8565385341644287, + "num_tokens": 118676290.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.7392117977142334, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8585171699523926, + "num_tokens": 118710251.0, + "step": 3105 + }, + { + "epoch": 0.3951151253021244, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.831941843032837, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8551434278488159, + "num_tokens": 118739514.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 1.2278556823730469e-05, + "grad_norm": 1.5809413194656372, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8443107008934021, + "num_tokens": 118777532.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.675765872001648, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8636199235916138, + "num_tokens": 118814321.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6329731941223145, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8518789410591125, + "num_tokens": 118853071.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7972079515457153, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.86113440990448, + "num_tokens": 118883466.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6330468654632568, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8391041159629822, + "num_tokens": 118921910.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6148759126663208, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8738840818405151, + "num_tokens": 118957049.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6683298349380493, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8675131797790527, + "num_tokens": 118990305.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7732338905334473, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8452580571174622, + "num_tokens": 119024196.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7420655488967896, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8499273657798767, + "num_tokens": 119055962.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.630739688873291, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8626186847686768, + "num_tokens": 119091484.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7920013666152954, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8506518602371216, + "num_tokens": 119130178.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.598127007484436, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8498278260231018, + "num_tokens": 119166968.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7133300304412842, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8523451089859009, + "num_tokens": 119202654.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6102312803268433, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8576976656913757, + "num_tokens": 119241611.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.5643174648284912, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8604913949966431, + "num_tokens": 119284635.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6856240034103394, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8647050857543945, + "num_tokens": 119319746.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.5081650018692017, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8743692636489868, + "num_tokens": 119357954.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.7077685594558716, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8659941554069519, + "num_tokens": 119388616.0, + "step": 3124 + }, + { + "epoch": 0.3975321205953441, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.5953751802444458, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8707447648048401, + "num_tokens": 119426009.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 1.233816146850586e-05, + "grad_norm": 1.6761953830718994, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8418688178062439, + "num_tokens": 119464713.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.6607714891433716, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8587673306465149, + "num_tokens": 119500453.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.3921507596969604, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8655312061309814, + "num_tokens": 119544067.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.7749863862991333, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8529752492904663, + "num_tokens": 119576624.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.6686128377914429, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8670030832290649, + "num_tokens": 119612441.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.7070367336273193, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8503609299659729, + "num_tokens": 119648210.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5905872583389282, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8543862700462341, + "num_tokens": 119689474.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5533440113067627, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8670681715011597, + "num_tokens": 119730536.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.653313159942627, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8652364015579224, + "num_tokens": 119764139.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.584321141242981, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8653410077095032, + "num_tokens": 119799215.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.587996244430542, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8621896505355835, + "num_tokens": 119838185.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5163052082061768, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8663702011108398, + "num_tokens": 119877397.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5922983884811401, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8628031015396118, + "num_tokens": 119914952.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.7694721221923828, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8551089763641357, + "num_tokens": 119946501.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.6985020637512207, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8483312129974365, + "num_tokens": 119978008.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.604759693145752, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.862701416015625, + "num_tokens": 120014207.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5333434343338013, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8765771985054016, + "num_tokens": 120052631.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5152556896209717, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8749227523803711, + "num_tokens": 120092485.0, + "step": 3143 + }, + { + "epoch": 0.3999491158885638, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.676951289176941, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8584513664245605, + "num_tokens": 120125550.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.641537070274353, + "learning_rate": 1e-06, + "loss": 0.5087, + "mean_token_accuracy": 0.8379431962966919, + "num_tokens": 120165332.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5928236246109009, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8462445735931396, + "num_tokens": 120202833.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.608028531074524, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8689892888069153, + "num_tokens": 120237420.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.7116680145263672, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8557678461074829, + "num_tokens": 120270437.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5518810749053955, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8578537702560425, + "num_tokens": 120312233.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5839391946792603, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8727238178253174, + "num_tokens": 120345436.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.538856029510498, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8614969253540039, + "num_tokens": 120385773.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.6104613542556763, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8664263486862183, + "num_tokens": 120421409.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.611434817314148, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.869766116142273, + "num_tokens": 120459703.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.5503901243209839, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8659959435462952, + "num_tokens": 120500908.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.4752084016799927, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8744809627532959, + "num_tokens": 120541322.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 1.239776611328125e-05, + "grad_norm": 1.7007256746292114, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8659501671791077, + "num_tokens": 120575077.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5721946954727173, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8413329124450684, + "num_tokens": 120617291.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.66965913772583, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8699880838394165, + "num_tokens": 120652566.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.598896861076355, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8571890592575073, + "num_tokens": 120691209.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5431952476501465, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8593696355819702, + "num_tokens": 120729837.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5719856023788452, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8702706098556519, + "num_tokens": 120767618.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.4592474699020386, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8841690421104431, + "num_tokens": 120807253.0, + "step": 3162 + }, + { + "epoch": 0.4023661111817835, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.6443874835968018, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8643765449523926, + "num_tokens": 120842760.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.67533278465271, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8628231883049011, + "num_tokens": 120879857.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5882996320724487, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8567395806312561, + "num_tokens": 120920081.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.4759275913238525, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.866633415222168, + "num_tokens": 120958795.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5130695104599, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.864989161491394, + "num_tokens": 120999547.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.8507276773452759, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8653904795646667, + "num_tokens": 121030027.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.622106909751892, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8412382006645203, + "num_tokens": 121070725.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5280115604400635, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8640683889389038, + "num_tokens": 121108529.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.4801111221313477, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8682712316513062, + "num_tokens": 121147691.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.6257086992263794, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8648940920829773, + "num_tokens": 121182585.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.5829567909240723, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8596308827400208, + "num_tokens": 121219394.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.4074201583862305, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8724576234817505, + "num_tokens": 121263233.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.6334762573242188, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8601847887039185, + "num_tokens": 121303909.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.837307095527649, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8547080159187317, + "num_tokens": 121332351.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.552783727645874, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8641455769538879, + "num_tokens": 121375014.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.6752201318740845, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8562470078468323, + "num_tokens": 121410964.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 1.245737075805664e-05, + "grad_norm": 1.6408571004867554, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8515185117721558, + "num_tokens": 121447927.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.640794038772583, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8673374652862549, + "num_tokens": 121482055.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.646592378616333, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8484531044960022, + "num_tokens": 121521992.0, + "step": 3181 + }, + { + "epoch": 0.4047831064750032, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.7058933973312378, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8523842096328735, + "num_tokens": 121556273.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.6841135025024414, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8562912344932556, + "num_tokens": 121587728.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.7277073860168457, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8353844881057739, + "num_tokens": 121624907.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.6325913667678833, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8608643412590027, + "num_tokens": 121661609.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.765439510345459, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8575701713562012, + "num_tokens": 121696597.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.5604766607284546, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8622674345970154, + "num_tokens": 121733552.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.6203176975250244, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8696416616439819, + "num_tokens": 121773165.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 1.2576580047607422e-05, + "grad_norm": 1.5719428062438965, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8558192849159241, + "num_tokens": 121818540.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.4442416429519653, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8555968999862671, + "num_tokens": 121862667.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6952056884765625, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8432211875915527, + "num_tokens": 121900328.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.778928518295288, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8545024394989014, + "num_tokens": 121931005.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6348401308059692, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8575633764266968, + "num_tokens": 121971914.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.490043044090271, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.859742283821106, + "num_tokens": 122017057.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.5978285074234009, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8760221004486084, + "num_tokens": 122055140.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.4855780601501465, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8559802770614624, + "num_tokens": 122097706.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.724979043006897, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.868750274181366, + "num_tokens": 122128780.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.5997283458709717, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8628560304641724, + "num_tokens": 122166106.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.7619110345840454, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8567168116569519, + "num_tokens": 122205167.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.7316246032714844, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8441125154495239, + "num_tokens": 122244733.0, + "step": 3200 + }, + { + "epoch": 0.40720010176822286, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.5437512397766113, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8532359600067139, + "num_tokens": 122285853.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.625029444694519, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8663875460624695, + "num_tokens": 122323422.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6003198623657227, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.861389696598053, + "num_tokens": 122358805.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.679142951965332, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8595987558364868, + "num_tokens": 122393768.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 2.10276460647583, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8662264347076416, + "num_tokens": 122423750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.781491994857788, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8440693020820618, + "num_tokens": 122461628.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.7849271297454834, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8619321584701538, + "num_tokens": 122496921.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6670056581497192, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8631590604782104, + "num_tokens": 122528389.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.7077347040176392, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8487895727157593, + "num_tokens": 122563460.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6655436754226685, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8523751497268677, + "num_tokens": 122601503.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 1.2695789337158203e-05, + "grad_norm": 1.6109774112701416, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8652154207229614, + "num_tokens": 122635464.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.5428518056869507, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8690676689147949, + "num_tokens": 122676637.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.5592797994613647, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8495061993598938, + "num_tokens": 122720546.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.6094160079956055, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8636115193367004, + "num_tokens": 122758179.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.6206748485565186, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.866895318031311, + "num_tokens": 122794918.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.6793017387390137, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8489222526550293, + "num_tokens": 122830039.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.6147774457931519, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8488622307777405, + "num_tokens": 122867358.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.6205958127975464, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8596378564834595, + "num_tokens": 122904607.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.642402172088623, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8510656952857971, + "num_tokens": 122943114.0, + "step": 3219 + }, + { + "epoch": 0.40961709706144256, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5858535766601562, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8680217266082764, + "num_tokens": 122981100.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.471845269203186, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8624065518379211, + "num_tokens": 123024450.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 1.2755393981933594e-05, + "grad_norm": 1.5221590995788574, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8628894686698914, + "num_tokens": 123064016.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7028812170028687, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8575299978256226, + "num_tokens": 123099606.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.6538424491882324, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8518926501274109, + "num_tokens": 123133789.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.6854356527328491, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8607703447341919, + "num_tokens": 123171974.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5279041528701782, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8835313320159912, + "num_tokens": 123205872.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7734556198120117, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8564988374710083, + "num_tokens": 123237713.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.51836097240448, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8554840683937073, + "num_tokens": 123280825.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7457554340362549, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8508249521255493, + "num_tokens": 123315774.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5861036777496338, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8516967296600342, + "num_tokens": 123354914.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.3991376161575317, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8698998689651489, + "num_tokens": 123398427.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5795923471450806, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8618278503417969, + "num_tokens": 123436238.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.544630765914917, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8614782094955444, + "num_tokens": 123474265.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.654951572418213, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8695070743560791, + "num_tokens": 123508941.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.771108865737915, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8474640846252441, + "num_tokens": 123548471.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7556265592575073, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8774312734603882, + "num_tokens": 123578313.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.6914137601852417, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8529688119888306, + "num_tokens": 123618041.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.679941177368164, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8423604369163513, + "num_tokens": 123656828.0, + "step": 3238 + }, + { + "epoch": 0.41203409235466226, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.648458480834961, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8468513488769531, + "num_tokens": 123695441.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7195658683776855, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8640817403793335, + "num_tokens": 123730787.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.3823858499526978, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8643040657043457, + "num_tokens": 123778636.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5346876382827759, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8518275022506714, + "num_tokens": 123820704.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.5982111692428589, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8430075645446777, + "num_tokens": 123859659.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.636595606803894, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8662302494049072, + "num_tokens": 123894538.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.4557430744171143, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8728960752487183, + "num_tokens": 123932354.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5794403553009033, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8686071634292603, + "num_tokens": 123970371.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.4431418180465698, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8717220425605774, + "num_tokens": 124014002.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.6143323183059692, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8665866851806641, + "num_tokens": 124049643.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 1.2874603271484375e-05, + "grad_norm": 1.7053394317626953, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8587156534194946, + "num_tokens": 124084433.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.7251943349838257, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8600274324417114, + "num_tokens": 124126052.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6303867101669312, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8419090509414673, + "num_tokens": 124163826.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6391520500183105, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8591733574867249, + "num_tokens": 124206402.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.556291103363037, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8590506911277771, + "num_tokens": 124245069.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.4904110431671143, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8484541773796082, + "num_tokens": 124291482.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5472508668899536, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8728822469711304, + "num_tokens": 124329548.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5066951513290405, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.864699125289917, + "num_tokens": 124373716.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6144301891326904, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.857516884803772, + "num_tokens": 124413057.0, + "step": 3257 + }, + { + "epoch": 0.41445108764788197, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5133554935455322, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8531851768493652, + "num_tokens": 124454026.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5749982595443726, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8689815402030945, + "num_tokens": 124490372.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5701792240142822, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8705258369445801, + "num_tokens": 124529213.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.5927841663360596, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.860948383808136, + "num_tokens": 124567115.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6004574298858643, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.863641083240509, + "num_tokens": 124604368.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.4969533681869507, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8694837689399719, + "num_tokens": 124648183.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6837236881256104, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8504825830459595, + "num_tokens": 124683901.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.6863828897476196, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8392690420150757, + "num_tokens": 124721612.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 1.2934207916259766e-05, + "grad_norm": 1.520247220993042, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8643933534622192, + "num_tokens": 124765870.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6925824880599976, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.850529670715332, + "num_tokens": 124802449.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.4961048364639282, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8611756563186646, + "num_tokens": 124846573.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7433487176895142, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8529889583587646, + "num_tokens": 124883095.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7174245119094849, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.859633207321167, + "num_tokens": 124916076.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 1.3053417205810547e-05, + "grad_norm": 1.6519502401351929, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.855796754360199, + "num_tokens": 124951611.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5188002586364746, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8712050914764404, + "num_tokens": 124991868.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5808814764022827, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8648723363876343, + "num_tokens": 125028777.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5903234481811523, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8587943315505981, + "num_tokens": 125068873.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7392770051956177, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8512494564056396, + "num_tokens": 125106287.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.662577509880066, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8630139827728271, + "num_tokens": 125144166.0, + "step": 3276 + }, + { + "epoch": 0.4168680829411016, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5521844625473022, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8737848997116089, + "num_tokens": 125180832.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7118557691574097, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8504273891448975, + "num_tokens": 125215398.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5357152223587036, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8690782189369202, + "num_tokens": 125256841.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.924355387687683, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8593524694442749, + "num_tokens": 125287626.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6909723281860352, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8650972843170166, + "num_tokens": 125323860.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5462768077850342, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8732520341873169, + "num_tokens": 125359796.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5572396516799927, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8569748997688293, + "num_tokens": 125401184.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5619009733200073, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8563181161880493, + "num_tokens": 125440267.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7976980209350586, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8528503775596619, + "num_tokens": 125476819.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6555616855621338, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8546288013458252, + "num_tokens": 125516227.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6796846389770508, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.861573338508606, + "num_tokens": 125553846.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6837575435638428, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8598127365112305, + "num_tokens": 125595452.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7712143659591675, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8573250770568848, + "num_tokens": 125630630.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5475739240646362, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.860105574131012, + "num_tokens": 125669665.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6625564098358154, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.85957270860672, + "num_tokens": 125709016.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.866916537284851, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8635786771774292, + "num_tokens": 125736995.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5585373640060425, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8693607449531555, + "num_tokens": 125772838.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6632128953933716, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8471422791481018, + "num_tokens": 125809523.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6749135255813599, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.850294828414917, + "num_tokens": 125845604.0, + "step": 3295 + }, + { + "epoch": 0.4192850782343213, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5398666858673096, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8684563040733337, + "num_tokens": 125884613.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.661188006401062, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8719100952148438, + "num_tokens": 125921526.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5545166730880737, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8550422787666321, + "num_tokens": 125962711.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5316299200057983, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8702377676963806, + "num_tokens": 126001530.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7235941886901855, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8514586091041565, + "num_tokens": 126039202.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5059067010879517, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8669850826263428, + "num_tokens": 126079330.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5837783813476562, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8526656627655029, + "num_tokens": 126119022.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6836313009262085, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.857330858707428, + "num_tokens": 126156053.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5445857048034668, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.860613226890564, + "num_tokens": 126195639.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6087311506271362, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8544695973396301, + "num_tokens": 126234632.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6230921745300293, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8495072722434998, + "num_tokens": 126272425.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6233832836151123, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8556573390960693, + "num_tokens": 126310748.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5478157997131348, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8534042835235596, + "num_tokens": 126353104.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.557081937789917, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.850069522857666, + "num_tokens": 126399202.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5966508388519287, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8467868566513062, + "num_tokens": 126437669.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6379916667938232, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8796125054359436, + "num_tokens": 126473174.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6582809686660767, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8509102463722229, + "num_tokens": 126510818.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5051372051239014, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8686197996139526, + "num_tokens": 126555063.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.57188880443573, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8670316338539124, + "num_tokens": 126592124.0, + "step": 3314 + }, + { + "epoch": 0.421702073527541, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5636793375015259, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.859585702419281, + "num_tokens": 126633588.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6476404666900635, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.847555935382843, + "num_tokens": 126673684.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.77168607711792, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8609576225280762, + "num_tokens": 126707611.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5499457120895386, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.859524130821228, + "num_tokens": 126747426.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5642319917678833, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8741586208343506, + "num_tokens": 126782799.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.4685558080673218, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8747546672821045, + "num_tokens": 126822974.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.608904242515564, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8488584160804749, + "num_tokens": 126861461.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5523070096969604, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8462585210800171, + "num_tokens": 126901899.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6740676164627075, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8616205453872681, + "num_tokens": 126942377.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6343934535980225, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8493369817733765, + "num_tokens": 126982919.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6645097732543945, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8759287595748901, + "num_tokens": 127014275.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.5403103828430176, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.875090479850769, + "num_tokens": 127054332.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7538278102874756, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8520342707633972, + "num_tokens": 127089223.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.6511915922164917, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8689562082290649, + "num_tokens": 127125361.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.7063566446304321, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8569820523262024, + "num_tokens": 127160345.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 1.2993812561035156e-05, + "grad_norm": 1.733755350112915, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8579112887382507, + "num_tokens": 127195078.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.7049604654312134, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8620277643203735, + "num_tokens": 127232381.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5520435571670532, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8642753958702087, + "num_tokens": 127268731.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.638951301574707, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8524867296218872, + "num_tokens": 127305455.0, + "step": 3333 + }, + { + "epoch": 0.4241190688207607, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6907941102981567, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8644202351570129, + "num_tokens": 127339924.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5952070951461792, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8570239543914795, + "num_tokens": 127375413.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6661587953567505, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8522400856018066, + "num_tokens": 127410872.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5569727420806885, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.843856930732727, + "num_tokens": 127451948.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6647638082504272, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8540385961532593, + "num_tokens": 127490441.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6381824016571045, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8724545240402222, + "num_tokens": 127530809.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.711670160293579, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8438353538513184, + "num_tokens": 127566957.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6714783906936646, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.863332986831665, + "num_tokens": 127604675.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.686879277229309, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8697566986083984, + "num_tokens": 127635201.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.580191969871521, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8578193783760071, + "num_tokens": 127670321.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.513232946395874, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8600786924362183, + "num_tokens": 127712654.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5544358491897583, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8494405746459961, + "num_tokens": 127755051.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.527629017829895, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8729708790779114, + "num_tokens": 127793949.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5835356712341309, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8595016002655029, + "num_tokens": 127833632.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5989978313446045, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8626123666763306, + "num_tokens": 127871756.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 2.005105495452881, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8627541065216064, + "num_tokens": 127908305.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8289337158203125, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8671659827232361, + "num_tokens": 127941904.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5982223749160767, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8532531261444092, + "num_tokens": 127984077.0, + "step": 3351 + }, + { + "epoch": 0.4264088538353899, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5802767276763916, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.875862717628479, + "num_tokens": 128023751.0, + "step": 3352 + }, + { + "epoch": 0.42653606411398043, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6667944192886353, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8599367737770081, + "num_tokens": 128059496.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5518832206726074, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8675898909568787, + "num_tokens": 128098349.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4670393466949463, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8761083483695984, + "num_tokens": 128138003.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5769401788711548, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8521133661270142, + "num_tokens": 128180833.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5529061555862427, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8584150075912476, + "num_tokens": 128221864.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.467079520225525, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8712607026100159, + "num_tokens": 128264553.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4959349632263184, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8495953679084778, + "num_tokens": 128308039.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5691429376602173, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8588818907737732, + "num_tokens": 128344593.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5786391496658325, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8722765445709229, + "num_tokens": 128379509.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6113137006759644, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8665546178817749, + "num_tokens": 128414127.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4750816822052002, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8826520442962646, + "num_tokens": 128453007.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6192072629928589, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8726423978805542, + "num_tokens": 128487872.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.601720929145813, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8615807294845581, + "num_tokens": 128525429.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.565626859664917, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8493065237998962, + "num_tokens": 128563714.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6191010475158691, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8666393756866455, + "num_tokens": 128598260.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5342752933502197, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8612292408943176, + "num_tokens": 128641233.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6005131006240845, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8564116954803467, + "num_tokens": 128682458.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4377340078353882, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8567852973937988, + "num_tokens": 128726764.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5945862531661987, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8580318093299866, + "num_tokens": 128761048.0, + "step": 3371 + }, + { + "epoch": 0.4289530594072001, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.7432758808135986, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8674341440200806, + "num_tokens": 128794729.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6877336502075195, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8545125722885132, + "num_tokens": 128832617.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.555971384048462, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8700554966926575, + "num_tokens": 128869752.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.65651273727417, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8776484727859497, + "num_tokens": 128907622.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8042964935302734, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8545716404914856, + "num_tokens": 128944638.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5731961727142334, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8586269021034241, + "num_tokens": 128982797.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8354053497314453, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8362343311309814, + "num_tokens": 129017684.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.600857138633728, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.863871693611145, + "num_tokens": 129054424.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4833821058273315, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8640855550765991, + "num_tokens": 129094874.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5407400131225586, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8621935844421387, + "num_tokens": 129136178.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.56462824344635, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8612065315246582, + "num_tokens": 129174678.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5059397220611572, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.862613320350647, + "num_tokens": 129215103.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6891062259674072, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8565908670425415, + "num_tokens": 129249261.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5014283657073975, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.863696813583374, + "num_tokens": 129291218.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5163459777832031, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8646795153617859, + "num_tokens": 129331637.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6660065650939941, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8604033589363098, + "num_tokens": 129365228.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6190329790115356, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8451977372169495, + "num_tokens": 129405461.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4975475072860718, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8589465022087097, + "num_tokens": 129447194.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6663744449615479, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8724738955497742, + "num_tokens": 129480196.0, + "step": 3390 + }, + { + "epoch": 0.4313700547004198, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6535688638687134, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8524140119552612, + "num_tokens": 129514418.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.7208105325698853, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8483489155769348, + "num_tokens": 129551177.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.672743797302246, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.848644495010376, + "num_tokens": 129588210.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.599534511566162, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8465703725814819, + "num_tokens": 129628355.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5502538681030273, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8604880571365356, + "num_tokens": 129667768.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5794932842254639, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8580071330070496, + "num_tokens": 129707718.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5498499870300293, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8676198720932007, + "num_tokens": 129746733.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.7039605379104614, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8605145215988159, + "num_tokens": 129780583.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.7327375411987305, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8704133033752441, + "num_tokens": 129813763.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6313904523849487, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.857032060623169, + "num_tokens": 129851231.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5391637086868286, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8735607862472534, + "num_tokens": 129892411.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6732009649276733, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8616166114807129, + "num_tokens": 129925597.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.551081657409668, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8660895824432373, + "num_tokens": 129965462.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6500033140182495, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8508710861206055, + "num_tokens": 130004747.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5256969928741455, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8643859624862671, + "num_tokens": 130049694.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8123083114624023, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8487039804458618, + "num_tokens": 130084588.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4648336172103882, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8451584577560425, + "num_tokens": 130131623.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8300623893737793, + "learning_rate": 1e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8351469039916992, + "num_tokens": 130164355.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.690806269645691, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8647336363792419, + "num_tokens": 130196699.0, + "step": 3409 + }, + { + "epoch": 0.4337870499936395, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.4975847005844116, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8593885898590088, + "num_tokens": 130236054.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5908935070037842, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8464134335517883, + "num_tokens": 130275854.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5304690599441528, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8722376823425293, + "num_tokens": 130315386.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.602028727531433, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8494750261306763, + "num_tokens": 130353474.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.531656265258789, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8433294296264648, + "num_tokens": 130394241.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.76287841796875, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8581379055976868, + "num_tokens": 130426502.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.681735634803772, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.867903470993042, + "num_tokens": 130461758.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.6290446519851685, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8762192726135254, + "num_tokens": 130499390.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.8104506731033325, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8563327789306641, + "num_tokens": 130530690.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5433239936828613, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8715553283691406, + "num_tokens": 130573716.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 1.3113021850585938e-05, + "grad_norm": 1.5752568244934082, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8383934497833252, + "num_tokens": 130616886.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6094396114349365, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8486719131469727, + "num_tokens": 130654935.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.6212221384048462, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8537254333496094, + "num_tokens": 130693241.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.679344892501831, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8615944981575012, + "num_tokens": 130730977.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.634590983390808, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.872902512550354, + "num_tokens": 130763675.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.8446956872940063, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8535189628601074, + "num_tokens": 130796313.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.6983485221862793, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8328518867492676, + "num_tokens": 130834728.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.6845847368240356, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8577311635017395, + "num_tokens": 130870728.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.5749356746673584, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8599163293838501, + "num_tokens": 130909883.0, + "step": 3428 + }, + { + "epoch": 0.4362040452868592, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.7042316198349, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8577943444252014, + "num_tokens": 130946440.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 1.3172626495361328e-05, + "grad_norm": 1.696792483329773, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8485856056213379, + "num_tokens": 130980057.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6657545566558838, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8651552200317383, + "num_tokens": 131015336.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6680784225463867, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8732383251190186, + "num_tokens": 131051213.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.701312780380249, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8483095765113831, + "num_tokens": 131087762.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6382588148117065, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8690258860588074, + "num_tokens": 131124641.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6705900430679321, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8440923690795898, + "num_tokens": 131165445.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5977435111999512, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8756510019302368, + "num_tokens": 131201590.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5824813842773438, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8531391620635986, + "num_tokens": 131239931.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.4742511510849, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.86053466796875, + "num_tokens": 131281763.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.4752631187438965, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8567554950714111, + "num_tokens": 131324857.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.4770225286483765, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8670991659164429, + "num_tokens": 131369461.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5237383842468262, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8634400367736816, + "num_tokens": 131410332.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.602258563041687, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8612587451934814, + "num_tokens": 131450207.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5562946796417236, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8584094047546387, + "num_tokens": 131489914.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6381174325942993, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.841539204120636, + "num_tokens": 131527729.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5758010149002075, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8559637665748596, + "num_tokens": 131567212.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6583139896392822, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8605025410652161, + "num_tokens": 131604840.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5624557733535767, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8830686807632446, + "num_tokens": 131642263.0, + "step": 3447 + }, + { + "epoch": 0.4386210405800789, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.551178216934204, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8669823408126831, + "num_tokens": 131683149.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.5896239280700684, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8651397228240967, + "num_tokens": 131719839.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6022306680679321, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8549691438674927, + "num_tokens": 131757231.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5324883460998535, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.869802713394165, + "num_tokens": 131795772.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5313050746917725, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8719784021377563, + "num_tokens": 131834955.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5570603609085083, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8724685907363892, + "num_tokens": 131871523.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6030082702636719, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8477874994277954, + "num_tokens": 131909645.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6009827852249146, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8699562549591064, + "num_tokens": 131944231.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 1.3232231140136719e-05, + "grad_norm": 1.6976567506790161, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8623934388160706, + "num_tokens": 131979761.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6784863471984863, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8497326970100403, + "num_tokens": 132015316.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.8435101509094238, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.855476975440979, + "num_tokens": 132048703.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6988078355789185, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8660047054290771, + "num_tokens": 132082378.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.484556794166565, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8714796900749207, + "num_tokens": 132121739.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.7287726402282715, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8617292642593384, + "num_tokens": 132157240.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6487659215927124, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8615846633911133, + "num_tokens": 132198889.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6813020706176758, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8497319221496582, + "num_tokens": 132236093.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6093357801437378, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8626470565795898, + "num_tokens": 132271404.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.7503408193588257, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8681771159172058, + "num_tokens": 132302599.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6895029544830322, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.86077481508255, + "num_tokens": 132342518.0, + "step": 3466 + }, + { + "epoch": 0.44103803587329854, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.7740535736083984, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8435318470001221, + "num_tokens": 132377864.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6298091411590576, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8548272848129272, + "num_tokens": 132420473.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5098758935928345, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8707329034805298, + "num_tokens": 132460642.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.4687867164611816, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8680212497711182, + "num_tokens": 132500344.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6948206424713135, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8478320240974426, + "num_tokens": 132541045.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.739977240562439, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8516924381256104, + "num_tokens": 132576615.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.8181560039520264, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8577602505683899, + "num_tokens": 132607946.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.4958600997924805, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8732873201370239, + "num_tokens": 132647990.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.554917812347412, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8515681028366089, + "num_tokens": 132691051.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.5493831634521484, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8528319597244263, + "num_tokens": 132728004.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6210452318191528, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8618013858795166, + "num_tokens": 132762272.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.7058061361312866, + "learning_rate": 1e-06, + "loss": 0.5153, + "mean_token_accuracy": 0.8332258462905884, + "num_tokens": 132797957.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.5747177600860596, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8598750829696655, + "num_tokens": 132833069.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.613136887550354, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8609045743942261, + "num_tokens": 132872389.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.5705724954605103, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8563227653503418, + "num_tokens": 132915391.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.691223382949829, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8583381772041321, + "num_tokens": 132950013.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.693257451057434, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8585889339447021, + "num_tokens": 132982552.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6324917078018188, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8628771305084229, + "num_tokens": 133018609.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 2.230788230895996, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8546348810195923, + "num_tokens": 133058499.0, + "step": 3485 + }, + { + "epoch": 0.44345503116651824, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.8477046489715576, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8493980765342712, + "num_tokens": 133089359.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5452731847763062, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8548648357391357, + "num_tokens": 133130601.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6315040588378906, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.861717939376831, + "num_tokens": 133167380.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.5336686372756958, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8553153872489929, + "num_tokens": 133214845.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6659667491912842, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8675609827041626, + "num_tokens": 133246249.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6355102062225342, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8443264961242676, + "num_tokens": 133281943.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.5375821590423584, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8634737133979797, + "num_tokens": 133319808.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 1.329183578491211e-05, + "grad_norm": 1.6768991947174072, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8478174209594727, + "num_tokens": 133354752.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.715503454208374, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8522288799285889, + "num_tokens": 133388388.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.67914879322052, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8640697598457336, + "num_tokens": 133428225.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6470582485198975, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.861880898475647, + "num_tokens": 133463052.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6260775327682495, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8689221739768982, + "num_tokens": 133496323.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.7678905725479126, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8761132955551147, + "num_tokens": 133525267.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.604008436203003, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8560792207717896, + "num_tokens": 133564285.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6557248830795288, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8552809357643127, + "num_tokens": 133600708.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5984126329421997, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8711413145065308, + "num_tokens": 133635121.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.5311598777770996, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8717506527900696, + "num_tokens": 133675052.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.7857893705368042, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.867660641670227, + "num_tokens": 133712606.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6013249158859253, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8491092920303345, + "num_tokens": 133755137.0, + "step": 3504 + }, + { + "epoch": 0.44587202645973795, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 2.1862921714782715, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8634318113327026, + "num_tokens": 133791099.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5753346681594849, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8592915534973145, + "num_tokens": 133830615.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6091176271438599, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8567530512809753, + "num_tokens": 133868813.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.626165747642517, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.856814980506897, + "num_tokens": 133907726.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 1.33514404296875e-05, + "grad_norm": 1.6648787260055542, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8635408282279968, + "num_tokens": 133945679.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6499818563461304, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8611741662025452, + "num_tokens": 133984533.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.594314694404602, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.853518009185791, + "num_tokens": 134022911.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6788132190704346, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8669694662094116, + "num_tokens": 134059740.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.501650094985962, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8819741010665894, + "num_tokens": 134099953.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5025248527526855, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8649125695228577, + "num_tokens": 134136271.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.7419394254684448, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8647253513336182, + "num_tokens": 134172972.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6564329862594604, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8453153371810913, + "num_tokens": 134208586.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.7612096071243286, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8371317386627197, + "num_tokens": 134248556.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6048083305358887, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8713396787643433, + "num_tokens": 134286866.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6758952140808105, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8624551296234131, + "num_tokens": 134322032.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6094797849655151, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8555941581726074, + "num_tokens": 134357407.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6821364164352417, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8483072519302368, + "num_tokens": 134396886.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.635344386100769, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.861038088798523, + "num_tokens": 134432503.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5741980075836182, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8528208136558533, + "num_tokens": 134472657.0, + "step": 3523 + }, + { + "epoch": 0.44828902175295765, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5851175785064697, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8611295223236084, + "num_tokens": 134511978.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.7274752855300903, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8709890246391296, + "num_tokens": 134547316.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.503692388534546, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.854151725769043, + "num_tokens": 134593715.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.548013687133789, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8747042417526245, + "num_tokens": 134632999.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6866129636764526, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8590219020843506, + "num_tokens": 134669832.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.6066974401474, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8449148535728455, + "num_tokens": 134709064.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5533074140548706, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8568278551101685, + "num_tokens": 134746702.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.4879372119903564, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8676824569702148, + "num_tokens": 134786754.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.463793396949768, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8776227831840515, + "num_tokens": 134825915.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.4896330833435059, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8747576475143433, + "num_tokens": 134870390.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5211975574493408, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8622864484786987, + "num_tokens": 134911604.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.5408681631088257, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.872856080532074, + "num_tokens": 134950647.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.7948375940322876, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8730724453926086, + "num_tokens": 134981812.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.531553030014038, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8547389507293701, + "num_tokens": 135020953.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 1.341104507446289e-05, + "grad_norm": 1.643276572227478, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8668503165245056, + "num_tokens": 135059386.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.7846956253051758, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8506467938423157, + "num_tokens": 135095407.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6743472814559937, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8822094202041626, + "num_tokens": 135126166.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6212141513824463, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8549118638038635, + "num_tokens": 135166425.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.516827940940857, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.859893798828125, + "num_tokens": 135206480.0, + "step": 3542 + }, + { + "epoch": 0.45070601704617735, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6894099712371826, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8456696271896362, + "num_tokens": 135245807.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6557096242904663, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8562391996383667, + "num_tokens": 135282785.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.7207057476043701, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8700296878814697, + "num_tokens": 135316379.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6397074460983276, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8752261996269226, + "num_tokens": 135348433.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6638115644454956, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8753401041030884, + "num_tokens": 135380804.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5015321969985962, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8681834936141968, + "num_tokens": 135420957.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.532063603401184, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8674197196960449, + "num_tokens": 135461221.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.7362860441207886, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.850502073764801, + "num_tokens": 135497213.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5592765808105469, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8605210781097412, + "num_tokens": 135537214.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6659539937973022, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8647574186325073, + "num_tokens": 135570789.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4982655048370361, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8599982261657715, + "num_tokens": 135612647.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5779064893722534, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8523160219192505, + "num_tokens": 135649663.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.9302648305892944, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8527988791465759, + "num_tokens": 135678966.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5825062990188599, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8414345979690552, + "num_tokens": 135718377.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.7070990800857544, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8551214337348938, + "num_tokens": 135754597.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4906718730926514, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8635861873626709, + "num_tokens": 135796501.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.640692114830017, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8580570220947266, + "num_tokens": 135833696.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.787645697593689, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8443344235420227, + "num_tokens": 135868867.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4453752040863037, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.869977593421936, + "num_tokens": 135911227.0, + "step": 3561 + }, + { + "epoch": 0.453123012339397, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5412406921386719, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.860087513923645, + "num_tokens": 135950549.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5540951490402222, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8526504039764404, + "num_tokens": 135988838.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4543845653533936, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8730840086936951, + "num_tokens": 136028707.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5541054010391235, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8697207570075989, + "num_tokens": 136064630.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.572008490562439, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.854705810546875, + "num_tokens": 136105700.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.687501311302185, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8688626289367676, + "num_tokens": 136138317.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4643386602401733, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8555319309234619, + "num_tokens": 136179254.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6673977375030518, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8470048904418945, + "num_tokens": 136218108.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.7191948890686035, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8473453521728516, + "num_tokens": 136253691.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5156842470169067, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8753556609153748, + "num_tokens": 136291761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5381032228469849, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.859397292137146, + "num_tokens": 136332045.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5163586139678955, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8580821752548218, + "num_tokens": 136369377.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4080570936203003, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8608953952789307, + "num_tokens": 136415042.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6158686876296997, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8700270652770996, + "num_tokens": 136450201.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5271512269973755, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8620933294296265, + "num_tokens": 136491343.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4344425201416016, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8757111430168152, + "num_tokens": 136535304.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5273903608322144, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8527970314025879, + "num_tokens": 136574886.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5709471702575684, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8684440851211548, + "num_tokens": 136611250.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.4132994413375854, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8576840162277222, + "num_tokens": 136657620.0, + "step": 3580 + }, + { + "epoch": 0.4555400076326167, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.6377686262130737, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8630578517913818, + "num_tokens": 136693176.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5633225440979004, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8533238172531128, + "num_tokens": 136732554.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.62432062625885, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8497648239135742, + "num_tokens": 136767890.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.766867756843567, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8755179047584534, + "num_tokens": 136799221.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5014064311981201, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8682119250297546, + "num_tokens": 136843595.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 1.3470649719238281e-05, + "grad_norm": 1.5876222848892212, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8594692945480347, + "num_tokens": 136886069.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.611642599105835, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8544105887413025, + "num_tokens": 136922415.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.537750482559204, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8696389198303223, + "num_tokens": 136959155.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5217701196670532, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8635655641555786, + "num_tokens": 136999030.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6864094734191895, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8537015914916992, + "num_tokens": 137032029.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5914686918258667, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8532348275184631, + "num_tokens": 137069753.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.677681565284729, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8461310863494873, + "num_tokens": 137104988.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5508372783660889, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8730831146240234, + "num_tokens": 137143677.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.7204949855804443, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8540224432945251, + "num_tokens": 137180744.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.632557988166809, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8678160309791565, + "num_tokens": 137217077.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.7825795412063599, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8668674826622009, + "num_tokens": 137245288.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.58689284324646, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8538562059402466, + "num_tokens": 137283251.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.538659691810608, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8659063577651978, + "num_tokens": 137321739.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5675405263900757, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8725700974464417, + "num_tokens": 137360356.0, + "step": 3599 + }, + { + "epoch": 0.4579570029258364, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5366621017456055, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8560829162597656, + "num_tokens": 137398988.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.4700981378555298, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8764568567276001, + "num_tokens": 137439579.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5477538108825684, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8661941885948181, + "num_tokens": 137477807.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6846925020217896, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8485233783721924, + "num_tokens": 137516127.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6706138849258423, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8474143743515015, + "num_tokens": 137556297.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.540088415145874, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.860814094543457, + "num_tokens": 137599396.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6007822751998901, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.850308358669281, + "num_tokens": 137638573.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6439470052719116, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.853268563747406, + "num_tokens": 137677304.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5324779748916626, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8776109218597412, + "num_tokens": 137715143.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.7190371751785278, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8473881483078003, + "num_tokens": 137752858.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5028952360153198, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8698035478591919, + "num_tokens": 137794560.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5778672695159912, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8625913858413696, + "num_tokens": 137835870.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5320245027542114, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8370789289474487, + "num_tokens": 137880619.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.625266671180725, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8605952858924866, + "num_tokens": 137915139.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6776684522628784, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8510745763778687, + "num_tokens": 137951840.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5649235248565674, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8552918434143066, + "num_tokens": 137990311.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5277539491653442, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.873200535774231, + "num_tokens": 138025566.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5056120157241821, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.85915207862854, + "num_tokens": 138067132.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6316583156585693, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8664931654930115, + "num_tokens": 138103275.0, + "step": 3618 + }, + { + "epoch": 0.4603739982190561, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5412607192993164, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8699290752410889, + "num_tokens": 138147178.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.6333987712860107, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8651198148727417, + "num_tokens": 138182390.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5781824588775635, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.842099130153656, + "num_tokens": 138221581.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5568275451660156, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8769304752349854, + "num_tokens": 138257970.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.5184223651885986, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8695452213287354, + "num_tokens": 138297172.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 1.3530254364013672e-05, + "grad_norm": 1.7660415172576904, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8488910794258118, + "num_tokens": 138329739.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.5815038681030273, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8713719844818115, + "num_tokens": 138371899.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.5506818294525146, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8563622832298279, + "num_tokens": 138411903.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.5921283960342407, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8505914807319641, + "num_tokens": 138450307.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.5085811614990234, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8667128086090088, + "num_tokens": 138487824.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.6499462127685547, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8440186977386475, + "num_tokens": 138526776.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.5915262699127197, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8634132146835327, + "num_tokens": 138565769.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.4632019996643066, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8634817600250244, + "num_tokens": 138607938.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.660029411315918, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8575927019119263, + "num_tokens": 138643504.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5121400356292725, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8646332025527954, + "num_tokens": 138685085.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5702954530715942, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8464473485946655, + "num_tokens": 138726097.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5541349649429321, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8512145280838013, + "num_tokens": 138767157.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6787738800048828, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8485406637191772, + "num_tokens": 138803279.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.616612434387207, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8612980842590332, + "num_tokens": 138841467.0, + "step": 3637 + }, + { + "epoch": 0.4627909935122758, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.68657648563385, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8588873147964478, + "num_tokens": 138878381.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 1.3589859008789062e-05, + "grad_norm": 1.6361181735992432, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8667084574699402, + "num_tokens": 138915453.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5517866611480713, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8608413934707642, + "num_tokens": 138953742.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5247009992599487, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8594962358474731, + "num_tokens": 138994346.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.563827395439148, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8577886819839478, + "num_tokens": 139036754.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6377743482589722, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8605541586875916, + "num_tokens": 139072311.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.547419548034668, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8488951921463013, + "num_tokens": 139111623.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.658213496208191, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8689833879470825, + "num_tokens": 139143798.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6182235479354858, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8637539744377136, + "num_tokens": 139181484.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.599048376083374, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8635073900222778, + "num_tokens": 139216190.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5456626415252686, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8709490299224854, + "num_tokens": 139252374.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.4959863424301147, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8689630031585693, + "num_tokens": 139295404.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6117751598358154, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8663679361343384, + "num_tokens": 139332539.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6196211576461792, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8649533987045288, + "num_tokens": 139367219.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.599603295326233, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.855350136756897, + "num_tokens": 139406478.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5102074146270752, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8695648908615112, + "num_tokens": 139448983.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.675066590309143, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8611539006233215, + "num_tokens": 139485691.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5827845335006714, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8609256148338318, + "num_tokens": 139526424.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.67145836353302, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8606587052345276, + "num_tokens": 139563351.0, + "step": 3656 + }, + { + "epoch": 0.46520798880549546, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.4600872993469238, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8550801277160645, + "num_tokens": 139608857.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.788496732711792, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.847888708114624, + "num_tokens": 139646435.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6257681846618652, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8710062503814697, + "num_tokens": 139681652.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.821489691734314, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8690966367721558, + "num_tokens": 139709434.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5443432331085205, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8602646589279175, + "num_tokens": 139747805.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5946297645568848, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8495866060256958, + "num_tokens": 139785692.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6517786979675293, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8558682203292847, + "num_tokens": 139821462.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.5799050331115723, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8683043718338013, + "num_tokens": 139857774.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.526136875152588, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8531760573387146, + "num_tokens": 139902991.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.4852166175842285, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.878086507320404, + "num_tokens": 139941238.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6661038398742676, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8567420244216919, + "num_tokens": 139978895.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 1.3649463653564453e-05, + "grad_norm": 1.6851847171783447, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8532199859619141, + "num_tokens": 140013785.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.5355665683746338, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8592339158058167, + "num_tokens": 140054553.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6039717197418213, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8583575487136841, + "num_tokens": 140092622.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.699027419090271, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8595876693725586, + "num_tokens": 140127162.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.740302562713623, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8647616505622864, + "num_tokens": 140162091.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.5007225275039673, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8545514345169067, + "num_tokens": 140203520.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.7382416725158691, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8648679256439209, + "num_tokens": 140237923.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.7166818380355835, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8629422187805176, + "num_tokens": 140272063.0, + "step": 3675 + }, + { + "epoch": 0.46762498409871517, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6859257221221924, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.861725926399231, + "num_tokens": 140308934.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6116188764572144, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8604910373687744, + "num_tokens": 140348872.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6756064891815186, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8552759289741516, + "num_tokens": 140385593.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.6167933940887451, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.859160304069519, + "num_tokens": 140420168.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 1.3709068298339844e-05, + "grad_norm": 1.7596697807312012, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8498457670211792, + "num_tokens": 140454899.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.8451290130615234, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8595612049102783, + "num_tokens": 140492882.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5959876775741577, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8631814122200012, + "num_tokens": 140531659.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5388859510421753, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8452889323234558, + "num_tokens": 140571578.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.7081656455993652, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.856481671333313, + "num_tokens": 140605423.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.4437779188156128, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8668514490127563, + "num_tokens": 140647284.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.6415797472000122, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8648970723152161, + "num_tokens": 140682074.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.476287841796875, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8713926076889038, + "num_tokens": 140720562.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.7361910343170166, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8732635974884033, + "num_tokens": 140751968.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5521376132965088, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8670122027397156, + "num_tokens": 140787836.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.4517822265625, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8755638599395752, + "num_tokens": 140832076.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5761312246322632, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8684441447257996, + "num_tokens": 140869685.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.6219440698623657, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8595095276832581, + "num_tokens": 140908042.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.604252815246582, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.848802924156189, + "num_tokens": 140947097.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.516581654548645, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8679827451705933, + "num_tokens": 140989216.0, + "step": 3694 + }, + { + "epoch": 0.47004197939193487, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5335121154785156, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8651266098022461, + "num_tokens": 141028973.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.6134527921676636, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8576068878173828, + "num_tokens": 141065374.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.4721441268920898, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8616613149642944, + "num_tokens": 141106247.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.7716132402420044, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8508071899414062, + "num_tokens": 141141074.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.4440171718597412, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8682229518890381, + "num_tokens": 141185460.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.7033883333206177, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8596131205558777, + "num_tokens": 141219092.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.5699818134307861, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8615449666976929, + "num_tokens": 141263714.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.526413083076477, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8550211191177368, + "num_tokens": 141302140.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.580142617225647, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.867281973361969, + "num_tokens": 141339742.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.6307203769683838, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8630040287971497, + "num_tokens": 141379502.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 1.3828277587890625e-05, + "grad_norm": 1.677573323249817, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8702859878540039, + "num_tokens": 141410994.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6141157150268555, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8614242076873779, + "num_tokens": 141444943.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.510064959526062, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8665554523468018, + "num_tokens": 141484778.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.7503877878189087, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8417733907699585, + "num_tokens": 141519420.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5885487794876099, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8512366414070129, + "num_tokens": 141559680.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.7767536640167236, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8455942869186401, + "num_tokens": 141592630.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5458145141601562, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8570711612701416, + "num_tokens": 141632959.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6253795623779297, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8542279005050659, + "num_tokens": 141671199.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5776275396347046, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8722230792045593, + "num_tokens": 141709756.0, + "step": 3713 + }, + { + "epoch": 0.4724589746851546, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6746985912322998, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8712834119796753, + "num_tokens": 141748105.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6252493858337402, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8527495265007019, + "num_tokens": 141785915.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6767903566360474, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8596780300140381, + "num_tokens": 141827064.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6044163703918457, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8595209121704102, + "num_tokens": 141868391.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.4834768772125244, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8496332764625549, + "num_tokens": 141910751.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.765831470489502, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8598521947860718, + "num_tokens": 141945375.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.4461398124694824, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.865384578704834, + "num_tokens": 141990647.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5282613039016724, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8817209005355835, + "num_tokens": 142031729.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5913434028625488, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8544946908950806, + "num_tokens": 142070287.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.509547472000122, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.87480628490448, + "num_tokens": 142115778.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5975379943847656, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8669760227203369, + "num_tokens": 142153836.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7272520065307617, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8637552261352539, + "num_tokens": 142193412.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.545081615447998, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8743304014205933, + "num_tokens": 142231966.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6111727952957153, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8642212748527527, + "num_tokens": 142268389.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6740034818649292, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8686773777008057, + "num_tokens": 142303163.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.7421468496322632, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.864859938621521, + "num_tokens": 142336558.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6163520812988281, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8676296472549438, + "num_tokens": 142372313.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5942643880844116, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8694059252738953, + "num_tokens": 142408489.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.572229027748108, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8729181289672852, + "num_tokens": 142444027.0, + "step": 3732 + }, + { + "epoch": 0.4748759699783743, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.7722569704055786, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8598638772964478, + "num_tokens": 142478702.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5506278276443481, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8627150654792786, + "num_tokens": 142520641.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6246730089187622, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8656230568885803, + "num_tokens": 142555806.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5155242681503296, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8681083917617798, + "num_tokens": 142596727.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.4302479028701782, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8666888475418091, + "num_tokens": 142638766.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6542142629623413, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8518403768539429, + "num_tokens": 142676257.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5810149908065796, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8665364980697632, + "num_tokens": 142718690.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.4845285415649414, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8738700151443481, + "num_tokens": 142755985.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6299413442611694, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.856696367263794, + "num_tokens": 142794222.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6752225160598755, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8511371612548828, + "num_tokens": 142832105.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.564363718032837, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8713968992233276, + "num_tokens": 142871402.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5310732126235962, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8751094341278076, + "num_tokens": 142910418.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.4821233749389648, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8720288276672363, + "num_tokens": 142952505.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.4643402099609375, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8622954487800598, + "num_tokens": 142996082.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5124586820602417, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8622860908508301, + "num_tokens": 143037482.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6964263916015625, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8766292333602905, + "num_tokens": 143069370.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.5590794086456299, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.852807343006134, + "num_tokens": 143111037.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 1.3887882232666016e-05, + "grad_norm": 1.6687394380569458, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8473449349403381, + "num_tokens": 143147168.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.441025972366333, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8614134192466736, + "num_tokens": 143191616.0, + "step": 3751 + }, + { + "epoch": 0.4772929652715939, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6537529230117798, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8520883321762085, + "num_tokens": 143230229.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5214895009994507, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8867146968841553, + "num_tokens": 143267838.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.4894288778305054, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8697853684425354, + "num_tokens": 143309566.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7682042121887207, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8537053465843201, + "num_tokens": 143343564.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6250903606414795, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8550129532814026, + "num_tokens": 143388195.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5858142375946045, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8689317107200623, + "num_tokens": 143425451.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7619192600250244, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8543335795402527, + "num_tokens": 143459024.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5625072717666626, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8600444793701172, + "num_tokens": 143506583.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6890034675598145, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8593621850013733, + "num_tokens": 143546556.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6882449388504028, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8635100722312927, + "num_tokens": 143579668.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5201835632324219, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8527239561080933, + "num_tokens": 143622220.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7303990125656128, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8588310480117798, + "num_tokens": 143657289.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.632928490638733, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8650723695755005, + "num_tokens": 143692218.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5801703929901123, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8743390440940857, + "num_tokens": 143728361.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6894129514694214, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8579619526863098, + "num_tokens": 143762958.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5973528623580933, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8570486307144165, + "num_tokens": 143802088.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.641852617263794, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8310389518737793, + "num_tokens": 143844643.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7524093389511108, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8550335168838501, + "num_tokens": 143876781.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6226418018341064, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8495111465454102, + "num_tokens": 143914987.0, + "step": 3770 + }, + { + "epoch": 0.47970996056481363, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6235417127609253, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.849646806716919, + "num_tokens": 143954108.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6869518756866455, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8570485711097717, + "num_tokens": 143991200.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5901745557785034, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8613731861114502, + "num_tokens": 144027477.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6290513277053833, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8699653148651123, + "num_tokens": 144060554.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5651212930679321, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8468660116195679, + "num_tokens": 144106153.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5779539346694946, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8710014820098877, + "num_tokens": 144143062.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5015931129455566, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8709279298782349, + "num_tokens": 144182100.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6125924587249756, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8494040966033936, + "num_tokens": 144224021.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5471339225769043, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8608525991439819, + "num_tokens": 144267633.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6033613681793213, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8634987473487854, + "num_tokens": 144304289.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6840919256210327, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8550925850868225, + "num_tokens": 144339822.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.5232356786727905, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8582713603973389, + "num_tokens": 144381876.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.6078472137451172, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8480978012084961, + "num_tokens": 144422987.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 1.3947486877441406e-05, + "grad_norm": 1.7437318563461304, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8638418316841125, + "num_tokens": 144452411.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5196712017059326, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8705183267593384, + "num_tokens": 144491668.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5080695152282715, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8618407845497131, + "num_tokens": 144530710.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.6467911005020142, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8673115968704224, + "num_tokens": 144569264.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.618526577949524, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8577326536178589, + "num_tokens": 144607693.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.494036078453064, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.859627366065979, + "num_tokens": 144654336.0, + "step": 3789 + }, + { + "epoch": 0.48212695585803333, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.6459364891052246, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8579805493354797, + "num_tokens": 144691486.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.685783863067627, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8474045395851135, + "num_tokens": 144729237.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.6492851972579956, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8601005673408508, + "num_tokens": 144773499.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.594286561012268, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8561033010482788, + "num_tokens": 144814520.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.6754250526428223, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8619078397750854, + "num_tokens": 144848034.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6810394525527954, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8555724620819092, + "num_tokens": 144883929.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6648306846618652, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8598698377609253, + "num_tokens": 144919146.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.8834812641143799, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8597536087036133, + "num_tokens": 144946112.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.4337278604507446, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8662330508232117, + "num_tokens": 144989842.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5526609420776367, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8560114502906799, + "num_tokens": 145028902.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5598615407943726, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8652812838554382, + "num_tokens": 145067242.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.527735710144043, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8545604348182678, + "num_tokens": 145110183.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.4548063278198242, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8779151439666748, + "num_tokens": 145148012.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.4565075635910034, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8605377078056335, + "num_tokens": 145192724.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.538219928741455, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8754155039787292, + "num_tokens": 145229723.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.4188097715377808, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.87518310546875, + "num_tokens": 145270027.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.6811251640319824, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.861295223236084, + "num_tokens": 145301138.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.537776231765747, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.861751914024353, + "num_tokens": 145347162.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5447192192077637, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8706215620040894, + "num_tokens": 145387377.0, + "step": 3808 + }, + { + "epoch": 0.48454395115125304, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5597875118255615, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8667075634002686, + "num_tokens": 145427504.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5402354001998901, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8714339137077332, + "num_tokens": 145465532.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.708795189857483, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8369208574295044, + "num_tokens": 145505477.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.676060676574707, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8549582958221436, + "num_tokens": 145540437.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5197864770889282, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8649672865867615, + "num_tokens": 145579349.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.537018895149231, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8621811270713806, + "num_tokens": 145616533.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 1.4007091522216797e-05, + "grad_norm": 1.5205864906311035, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8701648712158203, + "num_tokens": 145655575.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5856585502624512, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8757465481758118, + "num_tokens": 145691772.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7000218629837036, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8475561738014221, + "num_tokens": 145726610.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6409199237823486, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8541668653488159, + "num_tokens": 145761563.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6214827299118042, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8471454381942749, + "num_tokens": 145802913.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.65278959274292, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8597826361656189, + "num_tokens": 145841583.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5723800659179688, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8654401898384094, + "num_tokens": 145883819.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.632267713546753, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8411521911621094, + "num_tokens": 145926664.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5381488800048828, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8585205078125, + "num_tokens": 145964525.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5091497898101807, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8644509315490723, + "num_tokens": 146005681.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.61324942111969, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8624367713928223, + "num_tokens": 146041650.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6160023212432861, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8730252981185913, + "num_tokens": 146076195.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.4671053886413574, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.874768078327179, + "num_tokens": 146115159.0, + "step": 3827 + }, + { + "epoch": 0.4869609464444727, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5393420457839966, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8817243576049805, + "num_tokens": 146151618.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5774604082107544, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8704909682273865, + "num_tokens": 146191297.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7503324747085571, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8587010502815247, + "num_tokens": 146223837.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.602057933807373, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8601070642471313, + "num_tokens": 146263546.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5906054973602295, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8676155805587769, + "num_tokens": 146297925.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.490396499633789, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8561174869537354, + "num_tokens": 146343334.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6100244522094727, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8648053407669067, + "num_tokens": 146380848.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7568517923355103, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8471089601516724, + "num_tokens": 146416715.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7576490640640259, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8598134517669678, + "num_tokens": 146448000.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6318628787994385, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8802010416984558, + "num_tokens": 146483261.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5951504707336426, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8462358713150024, + "num_tokens": 146521893.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.4809232950210571, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8633896112442017, + "num_tokens": 146562247.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.4893704652786255, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8591021299362183, + "num_tokens": 146603159.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.635788083076477, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8752707242965698, + "num_tokens": 146640244.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7632540464401245, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8622966408729553, + "num_tokens": 146672029.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.571245789527893, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8649033904075623, + "num_tokens": 146712262.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.682461142539978, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8573080897331238, + "num_tokens": 146745825.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.563840389251709, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8697794675827026, + "num_tokens": 146783010.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6203179359436035, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8637427091598511, + "num_tokens": 146821605.0, + "step": 3846 + }, + { + "epoch": 0.4893779417376924, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.5597708225250244, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8496851325035095, + "num_tokens": 146863169.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6901510953903198, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8733349442481995, + "num_tokens": 146900715.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6309555768966675, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8607089519500732, + "num_tokens": 146940976.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.7267855405807495, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8770180344581604, + "num_tokens": 146974228.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.602887511253357, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8526250720024109, + "num_tokens": 147016319.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 1.4126300811767578e-05, + "grad_norm": 1.6161808967590332, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8673000335693359, + "num_tokens": 147052352.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.5283132791519165, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8666897416114807, + "num_tokens": 147093000.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6971008777618408, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8462966680526733, + "num_tokens": 147131119.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.5023573637008667, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8658460378646851, + "num_tokens": 147170447.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.584570050239563, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8556245565414429, + "num_tokens": 147208207.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6722906827926636, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8775733113288879, + "num_tokens": 147247753.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6100506782531738, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8702008724212646, + "num_tokens": 147282034.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.757309913635254, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8455962538719177, + "num_tokens": 147319354.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6467207670211792, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8734347820281982, + "num_tokens": 147353545.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.637823462486267, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8465858101844788, + "num_tokens": 147394814.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.7258306741714478, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8432559370994568, + "num_tokens": 147429851.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.7115089893341064, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8548281788825989, + "num_tokens": 147471636.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.643234133720398, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8576931953430176, + "num_tokens": 147508619.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.5130892992019653, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.871860921382904, + "num_tokens": 147549093.0, + "step": 3865 + }, + { + "epoch": 0.4917949370309121, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6654056310653687, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8521226644515991, + "num_tokens": 147586772.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.572881817817688, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8715921640396118, + "num_tokens": 147624680.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.4565168619155884, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8649662137031555, + "num_tokens": 147668461.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6542032957077026, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8650697469711304, + "num_tokens": 147708140.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6160955429077148, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8549591898918152, + "num_tokens": 147748500.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6117992401123047, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8586634397506714, + "num_tokens": 147782396.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6978563070297241, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8513889908790588, + "num_tokens": 147819280.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6035090684890747, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8587883710861206, + "num_tokens": 147860099.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.5348050594329834, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8645704984664917, + "num_tokens": 147904197.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.6484580039978027, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.860598087310791, + "num_tokens": 147940586.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 1.4185905456542969e-05, + "grad_norm": 1.7270668745040894, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8674062490463257, + "num_tokens": 147977785.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6449915170669556, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8449023365974426, + "num_tokens": 148017034.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5895500183105469, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8754825592041016, + "num_tokens": 148053967.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5941849946975708, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8670226335525513, + "num_tokens": 148091991.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7129946947097778, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8512281775474548, + "num_tokens": 148132512.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6707490682601929, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8500944972038269, + "num_tokens": 148171183.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.88302743434906, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8483024835586548, + "num_tokens": 148202746.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 2.0839014053344727, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8555929064750671, + "num_tokens": 148236152.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.4952279329299927, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8607913255691528, + "num_tokens": 148280558.0, + "step": 3884 + }, + { + "epoch": 0.4942119323241318, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.8461064100265503, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8521197438240051, + "num_tokens": 148311480.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6038326025009155, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.848833441734314, + "num_tokens": 148354564.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6488819122314453, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.871178388595581, + "num_tokens": 148387901.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.669227123260498, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8676554560661316, + "num_tokens": 148424783.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.4936397075653076, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8619430661201477, + "num_tokens": 148465250.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.8157954216003418, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8584276437759399, + "num_tokens": 148498822.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6551897525787354, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8731633424758911, + "num_tokens": 148536986.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5648061037063599, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8659238219261169, + "num_tokens": 148576618.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5864570140838623, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8582310676574707, + "num_tokens": 148616971.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 3.7831315994262695, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8662739992141724, + "num_tokens": 148649013.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7117300033569336, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8687756061553955, + "num_tokens": 148682592.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.513899326324463, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8793672323226929, + "num_tokens": 148718673.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6652438640594482, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8738935589790344, + "num_tokens": 148752201.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.4925243854522705, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.867674708366394, + "num_tokens": 148790926.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5810431241989136, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8571289777755737, + "num_tokens": 148833368.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.4998127222061157, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8673468232154846, + "num_tokens": 148874497.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6844068765640259, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8515265583992004, + "num_tokens": 148916557.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.784429907798767, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8583815693855286, + "num_tokens": 148952144.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5436378717422485, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8515654802322388, + "num_tokens": 148994164.0, + "step": 3903 + }, + { + "epoch": 0.4966289276173515, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6228699684143066, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8640061616897583, + "num_tokens": 149033066.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5166220664978027, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8369815349578857, + "num_tokens": 149079116.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7563464641571045, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8728247880935669, + "num_tokens": 149110697.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 1.424551010131836e-05, + "grad_norm": 1.6102215051651, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8471630811691284, + "num_tokens": 149150269.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.595656156539917, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.873573362827301, + "num_tokens": 149184259.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6663761138916016, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8460093140602112, + "num_tokens": 149222573.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7208573818206787, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.86017245054245, + "num_tokens": 149259047.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6629881858825684, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8550560474395752, + "num_tokens": 149295006.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7201913595199585, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8637998104095459, + "num_tokens": 149327654.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5893137454986572, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8734655380249023, + "num_tokens": 149366201.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.692542314529419, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8531073331832886, + "num_tokens": 149400982.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5864158868789673, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8582843542098999, + "num_tokens": 149440689.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5795131921768188, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8611846566200256, + "num_tokens": 149479775.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.677970290184021, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8707132935523987, + "num_tokens": 149515477.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.597429871559143, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8811129331588745, + "num_tokens": 149550319.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.605029582977295, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8571653366088867, + "num_tokens": 149588237.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5521869659423828, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8726974725723267, + "num_tokens": 149627015.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.7246019840240479, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8578106164932251, + "num_tokens": 149661540.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.727157473564148, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8606919050216675, + "num_tokens": 149698532.0, + "step": 3922 + }, + { + "epoch": 0.49904592291057115, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.64255952835083, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8437823057174683, + "num_tokens": 149739082.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5736339092254639, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8478760123252869, + "num_tokens": 149780034.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5052810907363892, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8547346591949463, + "num_tokens": 149819707.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6990725994110107, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8558268547058105, + "num_tokens": 149860802.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5156056880950928, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8640127182006836, + "num_tokens": 149903554.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5309957265853882, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8596581220626831, + "num_tokens": 149944313.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6982436180114746, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.866783618927002, + "num_tokens": 149978190.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6313427686691284, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8451578617095947, + "num_tokens": 150017505.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.550819993019104, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8614141941070557, + "num_tokens": 150059192.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5800106525421143, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8574532270431519, + "num_tokens": 150099889.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6013723611831665, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8582301139831543, + "num_tokens": 150139251.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.5934432744979858, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8478793501853943, + "num_tokens": 150181983.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.9391791820526123, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8778142333030701, + "num_tokens": 150219948.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 1.430511474609375e-05, + "grad_norm": 1.6192495822906494, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8725284934043884, + "num_tokens": 150255630.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.8396341800689697, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8456563949584961, + "num_tokens": 150289141.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6789908409118652, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.864245593547821, + "num_tokens": 150324818.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.723349690437317, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8537184000015259, + "num_tokens": 150358558.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5720198154449463, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8594903945922852, + "num_tokens": 150399808.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 1.436471939086914e-05, + "grad_norm": 1.6485682725906372, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.852476954460144, + "num_tokens": 150438652.0, + "step": 3941 + }, + { + "epoch": 0.5014629182037909, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5507526397705078, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8607538342475891, + "num_tokens": 150482286.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.585602879524231, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8396878242492676, + "num_tokens": 150524640.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.66594398021698, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8653436303138733, + "num_tokens": 150563920.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7047386169433594, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8404325246810913, + "num_tokens": 150602712.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.4692139625549316, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8719286918640137, + "num_tokens": 150644190.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.696940541267395, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8685379028320312, + "num_tokens": 150674369.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5011465549468994, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.863898515701294, + "num_tokens": 150715530.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6878488063812256, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8636993169784546, + "num_tokens": 150747896.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5455365180969238, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8564547300338745, + "num_tokens": 150786419.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7015224695205688, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8490225076675415, + "num_tokens": 150819350.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.9016557931900024, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8536989092826843, + "num_tokens": 150849879.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5682153701782227, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8626940250396729, + "num_tokens": 150890653.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6122424602508545, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8643741607666016, + "num_tokens": 150925708.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5343146324157715, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8625597357749939, + "num_tokens": 150967402.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6146658658981323, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.860900342464447, + "num_tokens": 151005814.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.4703224897384644, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8771469593048096, + "num_tokens": 151050097.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7466580867767334, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8673499226570129, + "num_tokens": 151079239.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5678179264068604, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8493306636810303, + "num_tokens": 151119588.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7765203714370728, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8447245955467224, + "num_tokens": 151154382.0, + "step": 3960 + }, + { + "epoch": 0.5038799134970106, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5213854312896729, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8583236932754517, + "num_tokens": 151196461.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7076072692871094, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.868252694606781, + "num_tokens": 151234058.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.630680799484253, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8531568050384521, + "num_tokens": 151270278.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.604585886001587, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8571658730506897, + "num_tokens": 151310966.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6548112630844116, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8497356176376343, + "num_tokens": 151349864.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5757161378860474, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8564766049385071, + "num_tokens": 151389465.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.464147686958313, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8811527490615845, + "num_tokens": 151430492.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.4836326837539673, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8596146106719971, + "num_tokens": 151473635.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5796688795089722, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8666893839836121, + "num_tokens": 151509330.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7139338254928589, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8609058856964111, + "num_tokens": 151541480.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.723523497581482, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8697413206100464, + "num_tokens": 151576787.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6363790035247803, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8800472617149353, + "num_tokens": 151616753.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.7610479593276978, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8434977531433105, + "num_tokens": 151655313.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.8576442003250122, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8483392000198364, + "num_tokens": 151692992.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5530755519866943, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.864495038986206, + "num_tokens": 151732305.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.933465838432312, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8607561588287354, + "num_tokens": 151768167.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.6992756128311157, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8590233325958252, + "num_tokens": 151802016.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.590967059135437, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8677065372467041, + "num_tokens": 151841015.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5552141666412354, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8463509678840637, + "num_tokens": 151884185.0, + "step": 3979 + }, + { + "epoch": 0.5062969087902303, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5873602628707886, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8497179746627808, + "num_tokens": 151923127.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5569819211959839, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8574449419975281, + "num_tokens": 151963891.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6288528442382812, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8477919697761536, + "num_tokens": 152001691.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.517027735710144, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8701288104057312, + "num_tokens": 152041117.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 1.4424324035644531e-05, + "grad_norm": 1.5344722270965576, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8610532283782959, + "num_tokens": 152081064.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.474805474281311, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8612381219863892, + "num_tokens": 152122378.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5865870714187622, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8659847974777222, + "num_tokens": 152161017.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 1.7414194345474243, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8616911172866821, + "num_tokens": 152201073.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5578696727752686, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8470491170883179, + "num_tokens": 152243140.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5308541059494019, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8654248714447021, + "num_tokens": 152282848.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5673179626464844, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8519574403762817, + "num_tokens": 152323676.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 4.614395618438721, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8642870187759399, + "num_tokens": 152359692.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.277376890182495, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8705323934555054, + "num_tokens": 152394727.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6232774257659912, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.874650239944458, + "num_tokens": 152430888.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.4570231437683105, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8572784066200256, + "num_tokens": 152476688.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6499934196472168, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.871638298034668, + "num_tokens": 152514415.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5605651140213013, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.870453953742981, + "num_tokens": 152552255.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.653203010559082, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8556933403015137, + "num_tokens": 152590632.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6734484434127808, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8663098216056824, + "num_tokens": 152622945.0, + "step": 3998 + }, + { + "epoch": 0.50871390408345, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5619361400604248, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8568715453147888, + "num_tokens": 152660793.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6590945720672607, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8719549775123596, + "num_tokens": 152693929.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.709714651107788, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8663646578788757, + "num_tokens": 152729492.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.633798599243164, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8642210364341736, + "num_tokens": 152765898.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.613351583480835, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8477970957756042, + "num_tokens": 152805059.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5392488241195679, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.863409161567688, + "num_tokens": 152843696.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5059030055999756, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8586799502372742, + "num_tokens": 152884986.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5111509561538696, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8574666976928711, + "num_tokens": 152925945.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5138894319534302, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8504934310913086, + "num_tokens": 152972120.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5087101459503174, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8528537750244141, + "num_tokens": 153012507.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.521424651145935, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.866213858127594, + "num_tokens": 153051643.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5320452451705933, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8654576539993286, + "num_tokens": 153089150.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6126588582992554, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8617212176322937, + "num_tokens": 153130048.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.627623200416565, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8649733066558838, + "num_tokens": 153167344.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.7118525505065918, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8418575525283813, + "num_tokens": 153206055.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.718120813369751, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.869015097618103, + "num_tokens": 153240573.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5288827419281006, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8580628633499146, + "num_tokens": 153281041.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6933289766311646, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8671669960021973, + "num_tokens": 153319615.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.626534342765808, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8585553169250488, + "num_tokens": 153355570.0, + "step": 4017 + }, + { + "epoch": 0.5111308993766697, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.5978354215621948, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8692981004714966, + "num_tokens": 153396242.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.8251564502716064, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8506669998168945, + "num_tokens": 153432476.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.7051749229431152, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8764487504959106, + "num_tokens": 153466196.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.522287130355835, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.860865592956543, + "num_tokens": 153507406.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6288481950759888, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8643837571144104, + "num_tokens": 153542931.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6077042818069458, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8457556366920471, + "num_tokens": 153581904.0, + "step": 4023 + }, + { + "epoch": 0.5118941610482127, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.626379370689392, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8557250499725342, + "num_tokens": 153623607.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 1.4483928680419922e-05, + "grad_norm": 1.6596537828445435, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8650029301643372, + "num_tokens": 153660686.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 1.4543533325195312e-05, + "grad_norm": 2.3734965324401855, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.872964084148407, + "num_tokens": 153692194.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.5762985944747925, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8623307347297668, + "num_tokens": 153736962.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6337103843688965, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8828967213630676, + "num_tokens": 153773802.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.5962110757827759, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8653403520584106, + "num_tokens": 153812645.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6512503623962402, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.864111065864563, + "num_tokens": 153847934.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6034343242645264, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8599647283554077, + "num_tokens": 153887770.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.8154029846191406, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8674049973487854, + "num_tokens": 153920330.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6737587451934814, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8701496720314026, + "num_tokens": 153957552.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6726332902908325, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8687387108802795, + "num_tokens": 153990602.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.546354055404663, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.875299334526062, + "num_tokens": 154029048.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.5792551040649414, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.871163547039032, + "num_tokens": 154064304.0, + "step": 4036 + }, + { + "epoch": 0.5135478946698894, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.49278724193573, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.86543869972229, + "num_tokens": 154106304.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.3984590768814087, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8665581941604614, + "num_tokens": 154152333.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6051790714263916, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8520808219909668, + "num_tokens": 154195278.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6072982549667358, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8500349521636963, + "num_tokens": 154239295.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6097214221954346, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8703994750976562, + "num_tokens": 154277675.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.7369213104248047, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8544405698776245, + "num_tokens": 154314124.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6254687309265137, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8473716974258423, + "num_tokens": 154350857.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6540143489837646, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8692997694015503, + "num_tokens": 154387736.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.7003841400146484, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8558509349822998, + "num_tokens": 154421669.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.6630334854125977, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8689308762550354, + "num_tokens": 154459163.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 1.4603137969970703e-05, + "grad_norm": 1.7681119441986084, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8499476909637451, + "num_tokens": 154496810.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7039040327072144, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.849811315536499, + "num_tokens": 154533856.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7847678661346436, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8601616024971008, + "num_tokens": 154568705.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6864768266677856, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8575962781906128, + "num_tokens": 154603875.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.5241467952728271, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8537172079086304, + "num_tokens": 154648481.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.632218360900879, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8709357976913452, + "num_tokens": 154686960.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7677364349365234, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8753719329833984, + "num_tokens": 154721340.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7871925830841064, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8555974364280701, + "num_tokens": 154753554.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.5757057666778564, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8537856340408325, + "num_tokens": 154795159.0, + "step": 4055 + }, + { + "epoch": 0.5159648899631091, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 3.64821720123291, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8558314442634583, + "num_tokens": 154839905.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.605656623840332, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8679955005645752, + "num_tokens": 154878826.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6164398193359375, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8582648634910583, + "num_tokens": 154918968.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7855634689331055, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8551672697067261, + "num_tokens": 154955058.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.5546469688415527, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8733940124511719, + "num_tokens": 154993700.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7281867265701294, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8525387048721313, + "num_tokens": 155031445.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.635136604309082, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8670781254768372, + "num_tokens": 155066551.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.565866470336914, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8649179935455322, + "num_tokens": 155106465.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6113457679748535, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8609161376953125, + "num_tokens": 155143543.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.538203477859497, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8611916303634644, + "num_tokens": 155185881.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7105371952056885, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8686197996139526, + "num_tokens": 155222147.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6058528423309326, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8637502193450928, + "num_tokens": 155262657.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.758687973022461, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.856804609298706, + "num_tokens": 155298813.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.4703776836395264, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8758851289749146, + "num_tokens": 155339036.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.6640655994415283, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.859539270401001, + "num_tokens": 155376544.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.5342059135437012, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8697018623352051, + "num_tokens": 155414539.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6771825551986694, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8563929796218872, + "num_tokens": 155449923.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.6009689569473267, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8609659671783447, + "num_tokens": 155488623.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.4818938970565796, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8744481801986694, + "num_tokens": 155530893.0, + "step": 4074 + }, + { + "epoch": 0.5183818852563287, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6661807298660278, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8480365872383118, + "num_tokens": 155571637.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6864606142044067, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8522747755050659, + "num_tokens": 155606646.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.528674840927124, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8514206409454346, + "num_tokens": 155654106.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.7976938486099243, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.854933500289917, + "num_tokens": 155688175.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6537843942642212, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.858685314655304, + "num_tokens": 155722029.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.4899569749832153, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8656727075576782, + "num_tokens": 155768641.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6468801498413086, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8489274978637695, + "num_tokens": 155809459.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.6746039390563965, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.861235499382019, + "num_tokens": 155848366.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.5783230066299438, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8381967544555664, + "num_tokens": 155889777.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.5958186388015747, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8564537763595581, + "num_tokens": 155934309.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 1.4662742614746094e-05, + "grad_norm": 1.5634979009628296, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8673146367073059, + "num_tokens": 155973267.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.761106252670288, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8628363609313965, + "num_tokens": 156006390.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 1.611388921737671, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8540936708450317, + "num_tokens": 156045820.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 1.4722347259521484e-05, + "grad_norm": 1.5498731136322021, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8660999536514282, + "num_tokens": 156085269.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 1.5791741609573364, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8564941883087158, + "num_tokens": 156127891.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.6995677947998047, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8661263585090637, + "num_tokens": 156161943.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.4954833984375, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8818036317825317, + "num_tokens": 156199529.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 1.5099565982818604, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8640451431274414, + "num_tokens": 156242685.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 1.4781951904296875e-05, + "grad_norm": 1.7632548809051514, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8444176912307739, + "num_tokens": 156273419.0, + "step": 4093 + }, + { + "epoch": 0.5207988805495484, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 4.652010440826416, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8514696359634399, + "num_tokens": 156311965.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.631578803062439, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.853700578212738, + "num_tokens": 156351326.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.6284061670303345, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8611334562301636, + "num_tokens": 156387379.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.785630702972412, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8627161979675293, + "num_tokens": 156419181.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.5594606399536133, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8684748411178589, + "num_tokens": 156456665.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.614980936050415, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8614687919616699, + "num_tokens": 156489560.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.7439409494400024, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8699848651885986, + "num_tokens": 156521707.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.7072573900222778, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8414171934127808, + "num_tokens": 156557992.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 1.4841556549072266e-05, + "grad_norm": 1.6027162075042725, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.869138240814209, + "num_tokens": 156597664.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6042678356170654, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8607265949249268, + "num_tokens": 156641152.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6613874435424805, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8726485967636108, + "num_tokens": 156674286.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.52254319190979, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8516331911087036, + "num_tokens": 156718495.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6598354578018188, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8740031123161316, + "num_tokens": 156753129.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.8095287084579468, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8741573095321655, + "num_tokens": 156784897.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5964407920837402, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8491262793540955, + "num_tokens": 156826709.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.65769362449646, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8556597828865051, + "num_tokens": 156864301.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6461849212646484, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.868951678276062, + "num_tokens": 156901217.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6201106309890747, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8380453586578369, + "num_tokens": 156945359.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5514992475509644, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8679797649383545, + "num_tokens": 156982857.0, + "step": 4112 + }, + { + "epoch": 0.5232158758427681, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6116443872451782, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8593195676803589, + "num_tokens": 157020179.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6429996490478516, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.860357403755188, + "num_tokens": 157055762.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6083295345306396, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8799288272857666, + "num_tokens": 157089424.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.4990392923355103, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8494300246238708, + "num_tokens": 157136009.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6183184385299683, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8528726696968079, + "num_tokens": 157174147.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5174332857131958, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8683258295059204, + "num_tokens": 157215474.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.4970752000808716, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8770922422409058, + "num_tokens": 157256390.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.582694172859192, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8690427541732788, + "num_tokens": 157293221.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.4772050380706787, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8638834953308105, + "num_tokens": 157335277.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.626208782196045, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8597840070724487, + "num_tokens": 157372348.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5537394285202026, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.876044750213623, + "num_tokens": 157406811.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.460342526435852, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8762984275817871, + "num_tokens": 157447239.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.521823763847351, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8476451635360718, + "num_tokens": 157493486.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6116009950637817, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8655604124069214, + "num_tokens": 157529566.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6714918613433838, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.866945743560791, + "num_tokens": 157563485.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5972431898117065, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8576902747154236, + "num_tokens": 157603315.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.46880042552948, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8763822913169861, + "num_tokens": 157644230.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6489291191101074, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8573721051216125, + "num_tokens": 157682013.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.7948744297027588, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8597676753997803, + "num_tokens": 157714190.0, + "step": 4131 + }, + { + "epoch": 0.5256328711359878, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6440423727035522, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.881255030632019, + "num_tokens": 157745738.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5350005626678467, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8682683706283569, + "num_tokens": 157784519.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.6444690227508545, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8637946844100952, + "num_tokens": 157816182.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5520381927490234, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8723949193954468, + "num_tokens": 157854157.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5907831192016602, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8643046617507935, + "num_tokens": 157889547.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5166844129562378, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8665777444839478, + "num_tokens": 157929564.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5419330596923828, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8691439628601074, + "num_tokens": 157968660.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 1.4901161193847656e-05, + "grad_norm": 1.5591293573379517, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8676097989082336, + "num_tokens": 158004489.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 1.5622040033340454, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8671698570251465, + "num_tokens": 158041857.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 1.4960765838623047e-05, + "grad_norm": 1.5925003290176392, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8689796924591064, + "num_tokens": 158079713.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5213016271591187, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8610935807228088, + "num_tokens": 158120352.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.459730863571167, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8625768423080444, + "num_tokens": 158161236.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5419846773147583, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8628726005554199, + "num_tokens": 158204933.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6722922325134277, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8514648675918579, + "num_tokens": 158242922.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.7550878524780273, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8570230603218079, + "num_tokens": 158277346.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.608683705329895, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8568810820579529, + "num_tokens": 158316091.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6139469146728516, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8617160320281982, + "num_tokens": 158353615.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6669822931289673, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8558130264282227, + "num_tokens": 158394005.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.7329559326171875, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8681672811508179, + "num_tokens": 158430645.0, + "step": 4150 + }, + { + "epoch": 0.5280498664292075, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5845746994018555, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8566458225250244, + "num_tokens": 158470191.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5702515840530396, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8473396301269531, + "num_tokens": 158515054.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6449761390686035, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.868789792060852, + "num_tokens": 158549904.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6840441226959229, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.861323356628418, + "num_tokens": 158579227.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5972740650177002, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8598559498786926, + "num_tokens": 158619179.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5749601125717163, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8605130314826965, + "num_tokens": 158658801.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6257781982421875, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8703523278236389, + "num_tokens": 158696988.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.531187891960144, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8579210638999939, + "num_tokens": 158739740.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.3978339433670044, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8745405673980713, + "num_tokens": 158782488.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6034353971481323, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8670769929885864, + "num_tokens": 158815970.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.723749041557312, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8354134559631348, + "num_tokens": 158855143.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6183300018310547, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8653377890586853, + "num_tokens": 158889322.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6415435075759888, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8652347922325134, + "num_tokens": 158925165.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5268876552581787, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8672230243682861, + "num_tokens": 158965475.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6391016244888306, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8551874756813049, + "num_tokens": 159002662.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6770753860473633, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.860853910446167, + "num_tokens": 159036180.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6848204135894775, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8571066856384277, + "num_tokens": 159070661.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5668526887893677, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8802932500839233, + "num_tokens": 159107348.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5301728248596191, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8709261417388916, + "num_tokens": 159148028.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6149908304214478, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8685808777809143, + "num_tokens": 159185831.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5689334869384766, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8573057651519775, + "num_tokens": 159226096.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5342353582382202, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8744739294052124, + "num_tokens": 159262687.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.8320056200027466, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8597986698150635, + "num_tokens": 159296956.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6031060218811035, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8801668286323547, + "num_tokens": 159333320.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6654409170150757, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8808583617210388, + "num_tokens": 159367477.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5371403694152832, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.86640864610672, + "num_tokens": 159404901.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6520029306411743, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8588396906852722, + "num_tokens": 159441137.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6533700227737427, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8576329946517944, + "num_tokens": 159479921.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6385434865951538, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8547629714012146, + "num_tokens": 159522052.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6576383113861084, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8726183772087097, + "num_tokens": 159557134.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.8026381731033325, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8458408117294312, + "num_tokens": 159589549.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6803029775619507, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8601891994476318, + "num_tokens": 159625707.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.7089210748672485, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.870628833770752, + "num_tokens": 159659935.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.4520937204360962, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8663557767868042, + "num_tokens": 159700985.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6173481941223145, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8517520427703857, + "num_tokens": 159737102.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.485718011856079, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8632946014404297, + "num_tokens": 159780154.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5578011274337769, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.850287675857544, + "num_tokens": 159822087.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5862687826156616, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.872843861579895, + "num_tokens": 159860421.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.532116413116455, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8678951263427734, + "num_tokens": 159906207.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.554253339767456, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8712373971939087, + "num_tokens": 159942611.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6116142272949219, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8540211915969849, + "num_tokens": 159979988.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6279691457748413, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8778199553489685, + "num_tokens": 160016156.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5480713844299316, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8699374198913574, + "num_tokens": 160057045.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6836782693862915, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8684917092323303, + "num_tokens": 160089063.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.631719946861267, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8760595917701721, + "num_tokens": 160122009.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6623249053955078, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8668696284294128, + "num_tokens": 160159606.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.655718445777893, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8640029430389404, + "num_tokens": 160197969.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6197625398635864, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8497578501701355, + "num_tokens": 160237420.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5684713125228882, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8493406772613525, + "num_tokens": 160276065.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.620697021484375, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.853729248046875, + "num_tokens": 160313849.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6168195009231567, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8566451072692871, + "num_tokens": 160357299.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6147420406341553, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8550063371658325, + "num_tokens": 160397409.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6579465866088867, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.870792031288147, + "num_tokens": 160430955.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6355957984924316, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.865515947341919, + "num_tokens": 160465614.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.5278197526931763, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8640189170837402, + "num_tokens": 160505386.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.542153239250183, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8549114465713501, + "num_tokens": 160547834.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.619142770767212, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8648809790611267, + "num_tokens": 160587928.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.6315006017684937, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8678181171417236, + "num_tokens": 160621263.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 1.5020370483398438e-05, + "grad_norm": 1.8183443546295166, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8548468351364136, + "num_tokens": 160660105.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6905449628829956, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8518713712692261, + "num_tokens": 160698946.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6419107913970947, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8703728914260864, + "num_tokens": 160738112.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 3.6810762882232666, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8551496267318726, + "num_tokens": 160780907.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5925995111465454, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.839478611946106, + "num_tokens": 160824328.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6466749906539917, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.85882967710495, + "num_tokens": 160864793.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6309058666229248, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.863228440284729, + "num_tokens": 160900482.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6670011281967163, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8515343070030212, + "num_tokens": 160938471.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6513397693634033, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8550446033477783, + "num_tokens": 160978738.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.742365837097168, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8516896963119507, + "num_tokens": 161014609.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5294208526611328, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8690276145935059, + "num_tokens": 161050590.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6681253910064697, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8468585014343262, + "num_tokens": 161089011.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.8220536708831787, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8506724834442139, + "num_tokens": 161122317.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5952509641647339, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8528425693511963, + "num_tokens": 161161484.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6391383409500122, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8483697175979614, + "num_tokens": 161202268.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7181137800216675, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.840366005897522, + "num_tokens": 161239784.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5890620946884155, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8598102331161499, + "num_tokens": 161280108.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.546294093132019, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8543702960014343, + "num_tokens": 161321977.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5569688081741333, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8722842931747437, + "num_tokens": 161358768.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.8221399784088135, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.851578950881958, + "num_tokens": 161395298.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.559048056602478, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8714346289634705, + "num_tokens": 161435961.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6832081079483032, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8501712679862976, + "num_tokens": 161472085.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5924500226974487, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8694322109222412, + "num_tokens": 161508380.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6219096183776855, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8739043474197388, + "num_tokens": 161544589.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6689176559448242, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8578009605407715, + "num_tokens": 161581126.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7086728811264038, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8408707976341248, + "num_tokens": 161615684.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6446056365966797, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.849079430103302, + "num_tokens": 161656449.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5066723823547363, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8641262054443359, + "num_tokens": 161697597.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6215873956680298, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8529878854751587, + "num_tokens": 161738838.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5975408554077148, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.853333592414856, + "num_tokens": 161773987.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5915454626083374, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8564294576644897, + "num_tokens": 161812604.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5823900699615479, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8620238900184631, + "num_tokens": 161850641.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.507675051689148, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8582684993743896, + "num_tokens": 161891510.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5054633617401123, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8701488971710205, + "num_tokens": 161928370.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5967061519622803, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8442769646644592, + "num_tokens": 161965900.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.4798762798309326, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.875840961933136, + "num_tokens": 162007438.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.621838092803955, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8481228351593018, + "num_tokens": 162049094.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6297004222869873, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8668325543403625, + "num_tokens": 162085177.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.574042797088623, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8690761923789978, + "num_tokens": 162121228.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5935109853744507, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8744367957115173, + "num_tokens": 162159736.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.4968931674957275, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.866777241230011, + "num_tokens": 162203081.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5893495082855225, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8668925762176514, + "num_tokens": 162242272.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6908260583877563, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8693832159042358, + "num_tokens": 162276353.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6050896644592285, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8627851009368896, + "num_tokens": 162313017.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.477239727973938, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.858551561832428, + "num_tokens": 162359809.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6294748783111572, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8570876717567444, + "num_tokens": 162400367.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 3.792832136154175, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8587841987609863, + "num_tokens": 162442009.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.8063730001449585, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8637830018997192, + "num_tokens": 162478889.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6061153411865234, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8553087115287781, + "num_tokens": 162520758.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6753768920898438, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.854777455329895, + "num_tokens": 162557533.0, + "step": 4258 + }, + { + "epoch": 0.5417885765169825, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.4713214635849, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8715219497680664, + "num_tokens": 162601033.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5746617317199707, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8628814816474915, + "num_tokens": 162640740.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7669258117675781, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.854106068611145, + "num_tokens": 162671266.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5517191886901855, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.863346517086029, + "num_tokens": 162708957.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.5474146604537964, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8638712167739868, + "num_tokens": 162749791.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.580011010169983, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.873849093914032, + "num_tokens": 162782121.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6144851446151733, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8625325560569763, + "num_tokens": 162821392.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7388639450073242, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8537875413894653, + "num_tokens": 162853436.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6961629390716553, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8681219816207886, + "num_tokens": 162884083.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7990292310714722, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8676759004592896, + "num_tokens": 162919840.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6311208009719849, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8640049695968628, + "num_tokens": 162960299.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.530045747756958, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8581897020339966, + "num_tokens": 163003606.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6464413404464722, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8615902066230774, + "num_tokens": 163041326.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.4435335397720337, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8648267388343811, + "num_tokens": 163085375.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.4813518524169922, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8673825860023499, + "num_tokens": 163124974.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.6095589399337769, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8673476576805115, + "num_tokens": 163159109.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.740255355834961, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8514115810394287, + "num_tokens": 163194729.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.7106707096099854, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.860920250415802, + "num_tokens": 163229372.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 1.5079975128173828e-05, + "grad_norm": 1.540272831916809, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.872991681098938, + "num_tokens": 163265798.0, + "step": 4277 + }, + { + "epoch": 0.5442055718102022, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.617881178855896, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8533821105957031, + "num_tokens": 163303851.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5678229331970215, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8617246150970459, + "num_tokens": 163341340.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5821219682693481, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8632665872573853, + "num_tokens": 163380165.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6736257076263428, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8488565683364868, + "num_tokens": 163419041.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5593513250350952, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8579748868942261, + "num_tokens": 163459604.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6528511047363281, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8676650524139404, + "num_tokens": 163495297.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5554627180099487, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8714589476585388, + "num_tokens": 163528156.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5722399950027466, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8679261803627014, + "num_tokens": 163563231.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.488729476928711, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.869405210018158, + "num_tokens": 163601666.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6866919994354248, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8644271492958069, + "num_tokens": 163634438.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5326706171035767, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8589828014373779, + "num_tokens": 163678405.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.7426449060440063, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8572397232055664, + "num_tokens": 163712751.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.8096237182617188, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8485079407691956, + "num_tokens": 163744458.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6924234628677368, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8581405878067017, + "num_tokens": 163777265.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5543304681777954, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8731650114059448, + "num_tokens": 163816160.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6703139543533325, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8528895974159241, + "num_tokens": 163853038.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6857775449752808, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8428802490234375, + "num_tokens": 163894479.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.649748682975769, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8411992192268372, + "num_tokens": 163931441.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.420190691947937, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.877208948135376, + "num_tokens": 163979507.0, + "step": 4296 + }, + { + "epoch": 0.5466225671034219, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.4888852834701538, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.862288236618042, + "num_tokens": 164022013.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.573163628578186, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8589123487472534, + "num_tokens": 164058227.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6484010219573975, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8551762104034424, + "num_tokens": 164096576.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6325139999389648, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8672999739646912, + "num_tokens": 164137071.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.406484842300415, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8612458109855652, + "num_tokens": 164185603.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5856612920761108, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8492333889007568, + "num_tokens": 164226048.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5382272005081177, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8562007546424866, + "num_tokens": 164270547.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6794261932373047, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8650450110435486, + "num_tokens": 164306703.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5066492557525635, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8660826683044434, + "num_tokens": 164346569.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5714633464813232, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8451713919639587, + "num_tokens": 164385574.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.501095175743103, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8744368553161621, + "num_tokens": 164426639.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5935351848602295, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8565276265144348, + "num_tokens": 164462904.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.4863297939300537, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8566497564315796, + "num_tokens": 164505745.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.7479463815689087, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8572108149528503, + "num_tokens": 164538598.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.581649899482727, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8566845655441284, + "num_tokens": 164579761.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.7106194496154785, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8614256381988525, + "num_tokens": 164611007.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.641037106513977, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8470866084098816, + "num_tokens": 164655587.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.647798776626587, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8613739609718323, + "num_tokens": 164693426.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.683477759361267, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8590598106384277, + "num_tokens": 164725991.0, + "step": 4315 + }, + { + "epoch": 0.5490395623966416, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6259843111038208, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8581078052520752, + "num_tokens": 164760776.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5999863147735596, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8645040988922119, + "num_tokens": 164791712.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5131888389587402, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8542922735214233, + "num_tokens": 164832919.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5069166421890259, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8661974668502808, + "num_tokens": 164874472.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.652595043182373, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8650778532028198, + "num_tokens": 164911136.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5936709642410278, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8622245788574219, + "num_tokens": 164946603.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6563295125961304, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8544811010360718, + "num_tokens": 164981457.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6462630033493042, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8489618301391602, + "num_tokens": 165018380.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5743327140808105, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8656177520751953, + "num_tokens": 165056600.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.807743787765503, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8620408177375793, + "num_tokens": 165091177.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5545012950897217, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8699294328689575, + "num_tokens": 165129446.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.4723539352416992, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8708246946334839, + "num_tokens": 165174503.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.694064736366272, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.857638418674469, + "num_tokens": 165211621.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.4958189725875854, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8693554401397705, + "num_tokens": 165253115.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6983078718185425, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8399753570556641, + "num_tokens": 165290245.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5188695192337036, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8714781999588013, + "num_tokens": 165329788.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.4853532314300537, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8756479024887085, + "num_tokens": 165371700.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6552790403366089, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8637872934341431, + "num_tokens": 165409752.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.7599642276763916, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8378658294677734, + "num_tokens": 165447929.0, + "step": 4334 + }, + { + "epoch": 0.5514565576898613, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5494558811187744, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8599157929420471, + "num_tokens": 165487076.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6106598377227783, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8563290238380432, + "num_tokens": 165523895.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6967004537582397, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8694167137145996, + "num_tokens": 165556770.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.6022251844406128, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8653409481048584, + "num_tokens": 165598223.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.669100284576416, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.858598530292511, + "num_tokens": 165633731.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.5596892833709717, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8777645826339722, + "num_tokens": 165670935.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.506773591041565, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.870834469795227, + "num_tokens": 165709169.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.593342661857605, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8616666793823242, + "num_tokens": 165750150.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 1.8413139581680298, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8590047955513, + "num_tokens": 165782670.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 1.5139579772949219e-05, + "grad_norm": 2.0662193298339844, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8424985408782959, + "num_tokens": 165822526.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7298415899276733, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8498454093933105, + "num_tokens": 165856324.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7530750036239624, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8604736328125, + "num_tokens": 165890182.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.9326093196868896, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8782840967178345, + "num_tokens": 165923014.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5569947957992554, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8626388311386108, + "num_tokens": 165963561.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5216270685195923, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8731241226196289, + "num_tokens": 165998700.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6215662956237793, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8707361817359924, + "num_tokens": 166033424.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.75594162940979, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8539246320724487, + "num_tokens": 166069755.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6378027200698853, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8725975751876831, + "num_tokens": 166108066.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.611812710762024, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.862834095954895, + "num_tokens": 166146207.0, + "step": 4353 + }, + { + "epoch": 0.553873552983081, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6816236972808838, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8631850481033325, + "num_tokens": 166180762.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7527804374694824, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8394738435745239, + "num_tokens": 166220609.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5837215185165405, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8477144837379456, + "num_tokens": 166260454.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5102335214614868, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8686551451683044, + "num_tokens": 166300811.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6568504571914673, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8512701392173767, + "num_tokens": 166337683.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5681540966033936, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8726889491081238, + "num_tokens": 166373841.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5992203950881958, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8677833080291748, + "num_tokens": 166410933.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7767045497894287, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8427579402923584, + "num_tokens": 166443188.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.588862419128418, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8604899644851685, + "num_tokens": 166486515.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4888737201690674, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8685773015022278, + "num_tokens": 166526664.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.638298749923706, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8680006265640259, + "num_tokens": 166562544.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4963572025299072, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8513319492340088, + "num_tokens": 166603463.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6702585220336914, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8440166711807251, + "num_tokens": 166645034.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7655551433563232, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8526933193206787, + "num_tokens": 166683715.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5881975889205933, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8659647703170776, + "num_tokens": 166720900.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5059351921081543, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8533087372779846, + "num_tokens": 166763194.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.601173758506775, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8685265779495239, + "num_tokens": 166802575.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.8608677387237549, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8410294055938721, + "num_tokens": 166836914.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6884878873825073, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8474332094192505, + "num_tokens": 166873353.0, + "step": 4372 + }, + { + "epoch": 0.5562905482763008, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6353669166564941, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8554797768592834, + "num_tokens": 166909462.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5346249341964722, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8733322620391846, + "num_tokens": 166946652.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.654183268547058, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8535159826278687, + "num_tokens": 166982424.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.8408178091049194, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8595638275146484, + "num_tokens": 167017272.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5804688930511475, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8510890007019043, + "num_tokens": 167060499.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5660651922225952, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8545457720756531, + "num_tokens": 167102277.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6020560264587402, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8546957969665527, + "num_tokens": 167139161.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5980185270309448, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8699057102203369, + "num_tokens": 167174465.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.658408522605896, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8594518899917603, + "num_tokens": 167215603.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6956206560134888, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8640128374099731, + "num_tokens": 167250496.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6010856628417969, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8692911863327026, + "num_tokens": 167287851.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6122251749038696, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8560881018638611, + "num_tokens": 167327633.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.541219711303711, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8695632219314575, + "num_tokens": 167368574.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.463350534439087, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8727143406867981, + "num_tokens": 167410520.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5936087369918823, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8681820631027222, + "num_tokens": 167446596.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.465187430381775, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8760440945625305, + "num_tokens": 167488802.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6815993785858154, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8601347208023071, + "num_tokens": 167526714.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.587355375289917, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8769510984420776, + "num_tokens": 167564661.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5201189517974854, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.852497935295105, + "num_tokens": 167607484.0, + "step": 4391 + }, + { + "epoch": 0.5587075435695205, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.662928819656372, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8585879802703857, + "num_tokens": 167645596.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5798254013061523, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.886665940284729, + "num_tokens": 167679773.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4958720207214355, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8767850399017334, + "num_tokens": 167717448.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.650628924369812, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8692249059677124, + "num_tokens": 167754938.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6996833086013794, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.859104335308075, + "num_tokens": 167791290.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4966312646865845, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8685551881790161, + "num_tokens": 167831240.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.625864863395691, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8570718765258789, + "num_tokens": 167871305.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6890805959701538, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8486077785491943, + "num_tokens": 167906508.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.596471905708313, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8563355803489685, + "num_tokens": 167943268.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5660964250564575, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8464038968086243, + "num_tokens": 167985117.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6522305011749268, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8412800431251526, + "num_tokens": 168020212.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.573622465133667, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8599361777305603, + "num_tokens": 168060140.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6889973878860474, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8646591305732727, + "num_tokens": 168093644.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7092764377593994, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8544713854789734, + "num_tokens": 168130061.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6418780088424683, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8684269189834595, + "num_tokens": 168165160.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6716477870941162, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8781876564025879, + "num_tokens": 168199427.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6498303413391113, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8737632632255554, + "num_tokens": 168231577.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6509257555007935, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.858767032623291, + "num_tokens": 168266755.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.555559754371643, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8586324453353882, + "num_tokens": 168307338.0, + "step": 4410 + }, + { + "epoch": 0.5611245388627402, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5959469079971313, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8776675462722778, + "num_tokens": 168340805.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5105198621749878, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8565585613250732, + "num_tokens": 168378793.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5167139768600464, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.85910964012146, + "num_tokens": 168420769.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6149933338165283, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8550866842269897, + "num_tokens": 168458143.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.589241623878479, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8607131242752075, + "num_tokens": 168493162.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5072894096374512, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8643229007720947, + "num_tokens": 168537058.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5830988883972168, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8670872449874878, + "num_tokens": 168575371.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6089118719100952, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8765653967857361, + "num_tokens": 168614142.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6363393068313599, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8508545160293579, + "num_tokens": 168650024.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5695035457611084, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.860073447227478, + "num_tokens": 168691668.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.754546046257019, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8540762662887573, + "num_tokens": 168725845.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4719537496566772, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8574979305267334, + "num_tokens": 168767736.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6256190538406372, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8522696495056152, + "num_tokens": 168806834.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6794899702072144, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8607081174850464, + "num_tokens": 168841409.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5722311735153198, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.857975959777832, + "num_tokens": 168880419.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5770175457000732, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8520189523696899, + "num_tokens": 168918269.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6496104001998901, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.858064591884613, + "num_tokens": 168956209.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6042546033859253, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8590105772018433, + "num_tokens": 168991288.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6008903980255127, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8720918893814087, + "num_tokens": 169028287.0, + "step": 4429 + }, + { + "epoch": 0.5635415341559598, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5631521940231323, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8723282814025879, + "num_tokens": 169064657.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.4964560270309448, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8771273493766785, + "num_tokens": 169103721.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.649398684501648, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8636643886566162, + "num_tokens": 169136175.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7003788948059082, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8683150410652161, + "num_tokens": 169170110.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5322872400283813, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8706746101379395, + "num_tokens": 169213178.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5190225839614868, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8697661757469177, + "num_tokens": 169253544.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.584414005279541, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.861016035079956, + "num_tokens": 169291552.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.428998589515686, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8878092765808105, + "num_tokens": 169332441.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6631304025650024, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8409589529037476, + "num_tokens": 169374900.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6351515054702759, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8694732785224915, + "num_tokens": 169414905.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5870198011398315, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8673725128173828, + "num_tokens": 169453846.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5831025838851929, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8799036145210266, + "num_tokens": 169489925.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6564315557479858, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8511641025543213, + "num_tokens": 169527675.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6453320980072021, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.861257791519165, + "num_tokens": 169563221.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.815506100654602, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8487061262130737, + "num_tokens": 169600038.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7268283367156982, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.856020450592041, + "num_tokens": 169637513.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.696300983428955, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8476550579071045, + "num_tokens": 169672375.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5181177854537964, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.860339879989624, + "num_tokens": 169716801.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6405763626098633, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8603824973106384, + "num_tokens": 169751058.0, + "step": 4448 + }, + { + "epoch": 0.5659585294491795, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5490012168884277, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8706079125404358, + "num_tokens": 169794841.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6198228597640991, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8593192100524902, + "num_tokens": 169837952.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.533957600593567, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8544718027114868, + "num_tokens": 169879681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.601514220237732, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8638721704483032, + "num_tokens": 169915101.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7262980937957764, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8625721335411072, + "num_tokens": 169949162.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6371841430664062, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.852025032043457, + "num_tokens": 169990361.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.5171631574630737, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8757245540618896, + "num_tokens": 170031872.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.6839056015014648, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8468058109283447, + "num_tokens": 170067405.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6244728565216064, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8657307028770447, + "num_tokens": 170109888.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.733272910118103, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.847990870475769, + "num_tokens": 170145085.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.624224305152893, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.864309549331665, + "num_tokens": 170182497.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.8174359798431396, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8449214696884155, + "num_tokens": 170217204.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 1.7423019409179688, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8571369051933289, + "num_tokens": 170250927.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "ewc_loss": 1.519918441772461e-05, + "grad_norm": 2.1722872257232666, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8631669282913208, + "num_tokens": 170285530.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.643008828163147, + "learning_rate": 1e-06, + "loss": 0.4973, + "mean_token_accuracy": 0.8386372327804565, + "num_tokens": 170324231.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6672312021255493, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8490163087844849, + "num_tokens": 170360799.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5532678365707397, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8613463640213013, + "num_tokens": 170402197.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.752845048904419, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.862274169921875, + "num_tokens": 170435887.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6126033067703247, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8625643849372864, + "num_tokens": 170470331.0, + "step": 4467 + }, + { + "epoch": 0.5683755247423992, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6774908304214478, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8720822334289551, + "num_tokens": 170506809.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.660068154335022, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.872724175453186, + "num_tokens": 170541499.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.529784083366394, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8556369543075562, + "num_tokens": 170583685.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7940574884414673, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8525944948196411, + "num_tokens": 170617649.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7093639373779297, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8636581897735596, + "num_tokens": 170649728.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5418344736099243, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8749125599861145, + "num_tokens": 170689346.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6923316717147827, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8718609809875488, + "num_tokens": 170725009.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6906458139419556, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8608251810073853, + "num_tokens": 170760707.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.8307242393493652, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.86540287733078, + "num_tokens": 170793967.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6132653951644897, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8596305847167969, + "num_tokens": 170829883.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5181341171264648, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8754242658615112, + "num_tokens": 170867922.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.408693552017212, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8637250661849976, + "num_tokens": 170915043.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6550254821777344, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8566912412643433, + "num_tokens": 170952122.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5876258611679077, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8617424368858337, + "num_tokens": 170987715.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5336322784423828, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8571484684944153, + "num_tokens": 171030584.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6915069818496704, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8688827753067017, + "num_tokens": 171067576.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5965428352355957, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8729731440544128, + "num_tokens": 171105221.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5723896026611328, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8658859729766846, + "num_tokens": 171143823.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6100740432739258, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8750542402267456, + "num_tokens": 171177543.0, + "step": 4486 + }, + { + "epoch": 0.5707925200356189, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.6498241424560547, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.852326512336731, + "num_tokens": 171220973.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.8075556755065918, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8645627498626709, + "num_tokens": 171255230.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7046364545822144, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8626217842102051, + "num_tokens": 171292369.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7301496267318726, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8739972710609436, + "num_tokens": 171323151.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.4632097482681274, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8774916529655457, + "num_tokens": 171363273.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5932197570800781, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8668117523193359, + "num_tokens": 171400893.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.4993939399719238, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8648487329483032, + "num_tokens": 171441766.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7548154592514038, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8609016537666321, + "num_tokens": 171476128.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7095154523849487, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8575279712677002, + "num_tokens": 171511624.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5868239402770996, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8766690492630005, + "num_tokens": 171550665.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.7106478214263916, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8663630485534668, + "num_tokens": 171583853.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.53003990650177, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8693718910217285, + "num_tokens": 171622305.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 1.537799835205078e-05, + "grad_norm": 1.5649069547653198, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8637962341308594, + "num_tokens": 171662879.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.561575174331665, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8564273715019226, + "num_tokens": 171702735.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6616499423980713, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8564987182617188, + "num_tokens": 171737977.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5652527809143066, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8565604090690613, + "num_tokens": 171777392.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5336354970932007, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8621741533279419, + "num_tokens": 171819601.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.552893877029419, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8707065582275391, + "num_tokens": 171853510.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7944538593292236, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8608253002166748, + "num_tokens": 171888437.0, + "step": 4505 + }, + { + "epoch": 0.5732095153288386, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6252357959747314, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.856050431728363, + "num_tokens": 171927883.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6152782440185547, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8645289540290833, + "num_tokens": 171966081.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.747149109840393, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8682571649551392, + "num_tokens": 171997152.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5967885255813599, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8588722944259644, + "num_tokens": 172034765.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5483962297439575, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8666117191314697, + "num_tokens": 172073399.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6143665313720703, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.862480640411377, + "num_tokens": 172111059.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.666759729385376, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8666422367095947, + "num_tokens": 172148169.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5897655487060547, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8613299131393433, + "num_tokens": 172187344.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5542515516281128, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8611962795257568, + "num_tokens": 172225270.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.616546392440796, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8704102039337158, + "num_tokens": 172261990.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7013697624206543, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8615120649337769, + "num_tokens": 172295379.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5769453048706055, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8566328883171082, + "num_tokens": 172336473.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5933761596679688, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8580734729766846, + "num_tokens": 172374581.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6100667715072632, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8507505059242249, + "num_tokens": 172413782.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6230676174163818, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8728378415107727, + "num_tokens": 172450173.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6269279718399048, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.867684543132782, + "num_tokens": 172490410.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5487009286880493, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8753308653831482, + "num_tokens": 172528205.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5933738946914673, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8622704744338989, + "num_tokens": 172565902.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5977590084075928, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.861916720867157, + "num_tokens": 172603404.0, + "step": 4524 + }, + { + "epoch": 0.5756265106220583, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5452463626861572, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8618290424346924, + "num_tokens": 172643626.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7448832988739014, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8612302541732788, + "num_tokens": 172676248.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6596860885620117, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8606042861938477, + "num_tokens": 172712669.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4858973026275635, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8666940927505493, + "num_tokens": 172751778.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6812539100646973, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8665306568145752, + "num_tokens": 172790537.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6067291498184204, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.862346351146698, + "num_tokens": 172832242.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6429294347763062, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8678508996963501, + "num_tokens": 172869348.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4852828979492188, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8748236894607544, + "num_tokens": 172911774.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 3.662081718444824, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8685847520828247, + "num_tokens": 172951342.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6117662191390991, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8641982078552246, + "num_tokens": 172991160.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5588983297348022, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8580661416053772, + "num_tokens": 173032151.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5730499029159546, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8711546659469604, + "num_tokens": 173069956.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6856895685195923, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8731873631477356, + "num_tokens": 173099334.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6883221864700317, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8678491115570068, + "num_tokens": 173133231.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5550414323806763, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8825783729553223, + "num_tokens": 173168338.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5376369953155518, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8532856106758118, + "num_tokens": 173207854.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6369223594665527, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8459311723709106, + "num_tokens": 173250600.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7024075984954834, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8509618043899536, + "num_tokens": 173286981.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6045258045196533, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8430997729301453, + "num_tokens": 173330191.0, + "step": 4543 + }, + { + "epoch": 0.578043505915278, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6587575674057007, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8555017709732056, + "num_tokens": 173370700.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5766068696975708, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8693618774414062, + "num_tokens": 173411681.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6789929866790771, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8556409478187561, + "num_tokens": 173447294.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6165000200271606, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8680469989776611, + "num_tokens": 173484267.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7332948446273804, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8457369804382324, + "num_tokens": 173524419.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7539763450622559, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.867109477519989, + "num_tokens": 173557029.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4999563694000244, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8541361689567566, + "num_tokens": 173602481.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7370526790618896, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8779374957084656, + "num_tokens": 173639074.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7256442308425903, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.853951096534729, + "num_tokens": 173674339.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.576000452041626, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8809919953346252, + "num_tokens": 173709198.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7256706953048706, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8605040311813354, + "num_tokens": 173740983.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5192970037460327, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8479366898536682, + "num_tokens": 173782873.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5469905138015747, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8730319738388062, + "num_tokens": 173821453.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5656260251998901, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8637622594833374, + "num_tokens": 173860622.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.935173749923706, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8614456653594971, + "num_tokens": 173895194.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7302902936935425, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8629205226898193, + "num_tokens": 173928390.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6805881261825562, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8590524196624756, + "num_tokens": 173969201.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7023054361343384, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8755284547805786, + "num_tokens": 173999362.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6628434658050537, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8588299751281738, + "num_tokens": 174037576.0, + "step": 4562 + }, + { + "epoch": 0.5804605012084977, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.63445246219635, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8664712309837341, + "num_tokens": 174071439.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6148834228515625, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8637751340866089, + "num_tokens": 174108653.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6457995176315308, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8590640425682068, + "num_tokens": 174149710.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5738036632537842, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8708276152610779, + "num_tokens": 174185407.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5824097394943237, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8700467944145203, + "num_tokens": 174228129.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.655590534210205, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8569202423095703, + "num_tokens": 174265774.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5407129526138306, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8663759231567383, + "num_tokens": 174304319.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5590643882751465, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.859026312828064, + "num_tokens": 174343161.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5178600549697876, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8711676597595215, + "num_tokens": 174381802.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5874314308166504, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8590058088302612, + "num_tokens": 174418570.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6097583770751953, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8667958378791809, + "num_tokens": 174462962.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5810301303863525, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8747721910476685, + "num_tokens": 174498146.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.728736400604248, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8605166673660278, + "num_tokens": 174529595.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.667762041091919, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8606794476509094, + "num_tokens": 174566469.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5372368097305298, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8591355681419373, + "num_tokens": 174607651.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5454236268997192, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8690199255943298, + "num_tokens": 174647786.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6807830333709717, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.853360652923584, + "num_tokens": 174683057.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6002156734466553, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8531218767166138, + "num_tokens": 174720324.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5907286405563354, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8530504107475281, + "num_tokens": 174758785.0, + "step": 4581 + }, + { + "epoch": 0.5828774965017174, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.577574372291565, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8510491847991943, + "num_tokens": 174797699.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.56650972366333, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8694700002670288, + "num_tokens": 174839816.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.602280616760254, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8655617237091064, + "num_tokens": 174880144.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6565930843353271, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8625734448432922, + "num_tokens": 174914362.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.41944420337677, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8664166927337646, + "num_tokens": 174959254.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6330554485321045, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8599154353141785, + "num_tokens": 174998010.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4337126016616821, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8678130507469177, + "num_tokens": 175043058.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5914355516433716, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8675729632377625, + "num_tokens": 175081749.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6560348272323608, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8570957779884338, + "num_tokens": 175119903.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6687532663345337, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8587044477462769, + "num_tokens": 175153482.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5381202697753906, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8682231903076172, + "num_tokens": 175193652.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.601298213005066, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8505921363830566, + "num_tokens": 175236119.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6195793151855469, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.855429470539093, + "num_tokens": 175271951.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6099324226379395, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8639854788780212, + "num_tokens": 175308680.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5307005643844604, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8613523244857788, + "num_tokens": 175348197.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5324773788452148, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8568013906478882, + "num_tokens": 175388886.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5352689027786255, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8674514293670654, + "num_tokens": 175427264.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.411892056465149, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8698910474777222, + "num_tokens": 175470733.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5970287322998047, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8672128915786743, + "num_tokens": 175508502.0, + "step": 4600 + }, + { + "epoch": 0.585294491794937, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6309336423873901, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.86083984375, + "num_tokens": 175546068.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6265796422958374, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8661031723022461, + "num_tokens": 175580998.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5982297658920288, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8546978831291199, + "num_tokens": 175619660.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7226425409317017, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8501853346824646, + "num_tokens": 175656513.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.617536187171936, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8629006147384644, + "num_tokens": 175695778.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.698715329170227, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8587944507598877, + "num_tokens": 175729235.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5736353397369385, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8709515333175659, + "num_tokens": 175764214.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4796334505081177, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8645974397659302, + "num_tokens": 175809538.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.661012053489685, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8396877646446228, + "num_tokens": 175847729.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6200507879257202, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8599387407302856, + "num_tokens": 175882675.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.522619366645813, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8789226412773132, + "num_tokens": 175920265.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7659990787506104, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8569875955581665, + "num_tokens": 175957780.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8350920677185059, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8678608536720276, + "num_tokens": 175988268.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.544848918914795, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8689683675765991, + "num_tokens": 176029554.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.579828143119812, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.861956775188446, + "num_tokens": 176065230.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6513429880142212, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8650345802307129, + "num_tokens": 176097938.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8319180011749268, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8408781290054321, + "num_tokens": 176129472.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.686824083328247, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8579841256141663, + "num_tokens": 176165549.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7239640951156616, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8588216304779053, + "num_tokens": 176204381.0, + "step": 4619 + }, + { + "epoch": 0.5877114870881567, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5685209035873413, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8606284856796265, + "num_tokens": 176244162.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5195293426513672, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8534457087516785, + "num_tokens": 176288209.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5402752161026, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8712817430496216, + "num_tokens": 176327126.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6315584182739258, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8768883943557739, + "num_tokens": 176364155.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.666431188583374, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8752277493476868, + "num_tokens": 176398180.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5321677923202515, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8624148368835449, + "num_tokens": 176441474.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5600308179855347, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.857507050037384, + "num_tokens": 176482973.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.661468744277954, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8644791841506958, + "num_tokens": 176517629.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.615932583808899, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8606675863265991, + "num_tokens": 176554551.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7131744623184204, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8610935211181641, + "num_tokens": 176592192.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.587188959121704, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8706789016723633, + "num_tokens": 176628389.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5216439962387085, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8589596748352051, + "num_tokens": 176672729.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5490235090255737, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8479894399642944, + "num_tokens": 176715562.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5595662593841553, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8721787333488464, + "num_tokens": 176750508.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7069694995880127, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8567385673522949, + "num_tokens": 176785307.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6245791912078857, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8729273080825806, + "num_tokens": 176820490.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.55149507522583, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8620126247406006, + "num_tokens": 176860414.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6563173532485962, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8671831488609314, + "num_tokens": 176899551.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.707970380783081, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8685681223869324, + "num_tokens": 176932608.0, + "step": 4638 + }, + { + "epoch": 0.5901284823813764, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5884839296340942, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8447096347808838, + "num_tokens": 176975487.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.535656452178955, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8682630658149719, + "num_tokens": 177013236.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5345845222473145, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8544965982437134, + "num_tokens": 177053637.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6154578924179077, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8625860810279846, + "num_tokens": 177093581.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.65180504322052, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8701999187469482, + "num_tokens": 177129782.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6562081575393677, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8589110374450684, + "num_tokens": 177168240.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5398467779159546, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8576123714447021, + "num_tokens": 177211743.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.599567174911499, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8690043687820435, + "num_tokens": 177250784.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.600616216659546, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8615918159484863, + "num_tokens": 177289354.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.801613211631775, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8632156848907471, + "num_tokens": 177324649.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6585553884506226, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8692590594291687, + "num_tokens": 177365281.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6090296506881714, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8655894994735718, + "num_tokens": 177401678.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5877999067306519, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8376369476318359, + "num_tokens": 177442374.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6946700811386108, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8679057359695435, + "num_tokens": 177477825.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5397862195968628, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8689478635787964, + "num_tokens": 177516405.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6156831979751587, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8757709264755249, + "num_tokens": 177551240.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5681663751602173, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8605848550796509, + "num_tokens": 177592889.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6511366367340088, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8650722503662109, + "num_tokens": 177630610.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6288071870803833, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.853058934211731, + "num_tokens": 177669257.0, + "step": 4657 + }, + { + "epoch": 0.5925454776745961, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5794252157211304, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8740907907485962, + "num_tokens": 177705965.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.642643928527832, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8480875492095947, + "num_tokens": 177743753.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7267770767211914, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8603627681732178, + "num_tokens": 177774742.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5606017112731934, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8759638071060181, + "num_tokens": 177813326.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.594925880432129, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8576987981796265, + "num_tokens": 177851014.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6213129758834839, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8508882522583008, + "num_tokens": 177889745.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6005957126617432, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8637247681617737, + "num_tokens": 177927354.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7087610960006714, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8588688969612122, + "num_tokens": 177962366.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6883183717727661, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8576387166976929, + "num_tokens": 177997424.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6404283046722412, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8565394878387451, + "num_tokens": 178034030.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6243641376495361, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8713163137435913, + "num_tokens": 178069237.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.584882378578186, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8545386791229248, + "num_tokens": 178109482.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.568412184715271, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8620488047599792, + "num_tokens": 178151432.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.567162275314331, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.850458562374115, + "num_tokens": 178191102.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6459431648254395, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8773022294044495, + "num_tokens": 178223208.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6669636964797974, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8580777645111084, + "num_tokens": 178256134.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.53374445438385, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.866557240486145, + "num_tokens": 178297562.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6208020448684692, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8722001910209656, + "num_tokens": 178338956.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.542724609375, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8680845499038696, + "num_tokens": 178376881.0, + "step": 4676 + }, + { + "epoch": 0.5949624729678158, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7965964078903198, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.86403888463974, + "num_tokens": 178411466.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.699681043624878, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8647772073745728, + "num_tokens": 178446669.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.751731514930725, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8417915105819702, + "num_tokens": 178481012.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.452014684677124, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8673925399780273, + "num_tokens": 178524897.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5838017463684082, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8598459362983704, + "num_tokens": 178564898.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5257941484451294, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8580601811408997, + "num_tokens": 178604627.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.575963020324707, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8637357950210571, + "num_tokens": 178643483.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.680846095085144, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8578759431838989, + "num_tokens": 178680379.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5483680963516235, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8892598748207092, + "num_tokens": 178714528.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6239136457443237, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.865782618522644, + "num_tokens": 178755324.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.520379900932312, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8715752959251404, + "num_tokens": 178791897.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7444071769714355, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8633607625961304, + "num_tokens": 178824595.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7153617143630981, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8764277696609497, + "num_tokens": 178862264.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6201505661010742, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8516772389411926, + "num_tokens": 178904131.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6215351819992065, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8691691160202026, + "num_tokens": 178942570.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5965542793273926, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8601008057594299, + "num_tokens": 178983954.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5835379362106323, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8542362451553345, + "num_tokens": 179025634.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6996185779571533, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.855602502822876, + "num_tokens": 179062797.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6695377826690674, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8612591028213501, + "num_tokens": 179097801.0, + "step": 4695 + }, + { + "epoch": 0.5973794682610355, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6850255727767944, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8431409597396851, + "num_tokens": 179133078.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6173789501190186, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8661571145057678, + "num_tokens": 179172716.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.774139165878296, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8633474111557007, + "num_tokens": 179206390.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6831181049346924, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8659930229187012, + "num_tokens": 179239665.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4270073175430298, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8784726858139038, + "num_tokens": 179279944.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.561493992805481, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8579773902893066, + "num_tokens": 179319527.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5460288524627686, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8596397042274475, + "num_tokens": 179358674.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4570919275283813, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8691713809967041, + "num_tokens": 179403331.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5972524881362915, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.862195611000061, + "num_tokens": 179440091.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7048604488372803, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8612267374992371, + "num_tokens": 179474277.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4392496347427368, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8735889196395874, + "num_tokens": 179516756.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6076141595840454, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8690171837806702, + "num_tokens": 179550813.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5734920501708984, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8610591292381287, + "num_tokens": 179593040.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7262593507766724, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8448816537857056, + "num_tokens": 179629034.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5561764240264893, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8516907095909119, + "num_tokens": 179673674.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4968483448028564, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8636949062347412, + "num_tokens": 179712829.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.541162371635437, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8713090419769287, + "num_tokens": 179749830.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5909976959228516, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8538028001785278, + "num_tokens": 179790594.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.586351990699768, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8641608357429504, + "num_tokens": 179831138.0, + "step": 4714 + }, + { + "epoch": 0.5997964635542552, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5363998413085938, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8617898225784302, + "num_tokens": 179869954.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.623889446258545, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8484423160552979, + "num_tokens": 179905577.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.569993257522583, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8722105026245117, + "num_tokens": 179940966.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6088923215866089, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8592718243598938, + "num_tokens": 179976814.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.704314947128296, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8524422645568848, + "num_tokens": 180013398.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5972791910171509, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8647626638412476, + "num_tokens": 180051370.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.698894739151001, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8565703630447388, + "num_tokens": 180087071.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5139223337173462, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.863878607749939, + "num_tokens": 180131576.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5739057064056396, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8684332370758057, + "num_tokens": 180170753.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5383050441741943, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8665972352027893, + "num_tokens": 180207110.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6547359228134155, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8594557642936707, + "num_tokens": 180238914.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7808963060379028, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8593224287033081, + "num_tokens": 180271799.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6834050416946411, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8783553838729858, + "num_tokens": 180306693.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.586097240447998, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.873167097568512, + "num_tokens": 180341698.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5581638813018799, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8801102638244629, + "num_tokens": 180377994.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6902377605438232, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8481767773628235, + "num_tokens": 180414384.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.758994460105896, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8610367178916931, + "num_tokens": 180448174.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5334134101867676, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8735995292663574, + "num_tokens": 180489780.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5731632709503174, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8692123293876648, + "num_tokens": 180531289.0, + "step": 4733 + }, + { + "epoch": 0.6022134588474749, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7141081094741821, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8538657426834106, + "num_tokens": 180567820.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6019238233566284, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8498305678367615, + "num_tokens": 180606745.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6160264015197754, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8634878396987915, + "num_tokens": 180642040.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5588375329971313, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8709328770637512, + "num_tokens": 180679113.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.685678482055664, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8786750435829163, + "num_tokens": 180713742.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6613171100616455, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8630484938621521, + "num_tokens": 180749473.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7236428260803223, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8688391447067261, + "num_tokens": 180783242.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5947929620742798, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8597665429115295, + "num_tokens": 180823139.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.723635196685791, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8483845591545105, + "num_tokens": 180858724.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.748561143875122, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8691593408584595, + "num_tokens": 180894034.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.517930507659912, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8703582286834717, + "num_tokens": 180933049.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6268104314804077, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8751904368400574, + "num_tokens": 180966416.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7823901176452637, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8568558692932129, + "num_tokens": 181003972.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5979795455932617, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8628392219543457, + "num_tokens": 181043269.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.417811393737793, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8852084875106812, + "num_tokens": 181084009.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5081052780151367, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8655118942260742, + "num_tokens": 181124927.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6973391771316528, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8577677011489868, + "num_tokens": 181158987.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5971912145614624, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8427455425262451, + "num_tokens": 181198281.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6742918491363525, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8632128238677979, + "num_tokens": 181233169.0, + "step": 4752 + }, + { + "epoch": 0.6046304541406946, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6558562517166138, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8484456539154053, + "num_tokens": 181268432.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5616562366485596, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.857962965965271, + "num_tokens": 181307430.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.693010687828064, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8390324115753174, + "num_tokens": 181345092.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5835262537002563, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8602614402770996, + "num_tokens": 181381621.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.571750521659851, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8641193509101868, + "num_tokens": 181423613.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5426032543182373, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8714767694473267, + "num_tokens": 181462559.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.552314043045044, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8543509840965271, + "num_tokens": 181498713.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6376625299453735, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8654069900512695, + "num_tokens": 181531001.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6168795824050903, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8673192262649536, + "num_tokens": 181569657.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5594638586044312, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8783196210861206, + "num_tokens": 181605731.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5785088539123535, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8557754755020142, + "num_tokens": 181648829.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4934228658676147, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8473976850509644, + "num_tokens": 181693762.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8788676261901855, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.872804582118988, + "num_tokens": 181730462.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6156620979309082, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8337655067443848, + "num_tokens": 181772837.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4631673097610474, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8542376756668091, + "num_tokens": 181818505.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6935315132141113, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8499776124954224, + "num_tokens": 181856627.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5866529941558838, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8510302305221558, + "num_tokens": 181897263.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6403331756591797, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8687359094619751, + "num_tokens": 181932200.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7486028671264648, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8635873198509216, + "num_tokens": 181963683.0, + "step": 4771 + }, + { + "epoch": 0.6070474494339143, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.699102520942688, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8464311361312866, + "num_tokens": 181999822.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6118268966674805, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8740619421005249, + "num_tokens": 182033103.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.502366542816162, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8774718046188354, + "num_tokens": 182072245.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4617657661437988, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8590278625488281, + "num_tokens": 182113578.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6286662817001343, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8866719007492065, + "num_tokens": 182147604.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5882924795150757, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8674038648605347, + "num_tokens": 182184900.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6175543069839478, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8615902066230774, + "num_tokens": 182220932.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 3.74424147605896, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8580306768417358, + "num_tokens": 182259815.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5939310789108276, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8765673637390137, + "num_tokens": 182292245.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5551799535751343, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8721054792404175, + "num_tokens": 182330733.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6376210451126099, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8643643260002136, + "num_tokens": 182364261.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6927920579910278, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8577404022216797, + "num_tokens": 182400723.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5665316581726074, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8498407006263733, + "num_tokens": 182441087.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7231813669204712, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8520316481590271, + "num_tokens": 182477123.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7473783493041992, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8524987697601318, + "num_tokens": 182510213.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5551934242248535, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8658663034439087, + "num_tokens": 182548773.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.605945348739624, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8544434905052185, + "num_tokens": 182586507.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6152995824813843, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8639293909072876, + "num_tokens": 182622571.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6126391887664795, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8481886386871338, + "num_tokens": 182662240.0, + "step": 4790 + }, + { + "epoch": 0.6094644447271339, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6497817039489746, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8539783954620361, + "num_tokens": 182700175.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6462057828903198, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8539326190948486, + "num_tokens": 182738356.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4930216073989868, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8615626096725464, + "num_tokens": 182777810.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5885365009307861, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8730635643005371, + "num_tokens": 182814944.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5081512928009033, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8633131980895996, + "num_tokens": 182855325.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5567739009857178, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8732611536979675, + "num_tokens": 182893196.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5575014352798462, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8714337944984436, + "num_tokens": 182929993.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5582138299942017, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.853535532951355, + "num_tokens": 182973655.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6547956466674805, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8671844005584717, + "num_tokens": 183008062.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7228988409042358, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.858128547668457, + "num_tokens": 183044596.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.657895565032959, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8454070091247559, + "num_tokens": 183083229.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6152615547180176, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.859607458114624, + "num_tokens": 183121788.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.9487743377685547, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8551890254020691, + "num_tokens": 183152634.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5712794065475464, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8718585968017578, + "num_tokens": 183189273.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6743663549423218, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.866447925567627, + "num_tokens": 183228092.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7093538045883179, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8680151700973511, + "num_tokens": 183261625.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5693782567977905, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8774632215499878, + "num_tokens": 183301989.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5529167652130127, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8747531175613403, + "num_tokens": 183344518.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7978427410125732, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8657563924789429, + "num_tokens": 183376082.0, + "step": 4809 + }, + { + "epoch": 0.6118814400203536, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6223739385604858, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8501420617103577, + "num_tokens": 183414608.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6507669687271118, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8700382709503174, + "num_tokens": 183451091.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5744094848632812, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8620653748512268, + "num_tokens": 183490286.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5949617624282837, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8496978282928467, + "num_tokens": 183529298.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4956905841827393, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8640604615211487, + "num_tokens": 183573298.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4442453384399414, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8569245338439941, + "num_tokens": 183617974.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.464240312576294, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8657263517379761, + "num_tokens": 183660985.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5961246490478516, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8672791719436646, + "num_tokens": 183695969.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5966827869415283, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8579304218292236, + "num_tokens": 183736004.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5055476427078247, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8751354217529297, + "num_tokens": 183770967.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.9036959409713745, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8678643703460693, + "num_tokens": 183806776.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4569734334945679, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8579919934272766, + "num_tokens": 183849559.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.3804584741592407, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8671573400497437, + "num_tokens": 183897547.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.680880069732666, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8618039488792419, + "num_tokens": 183933889.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.370556116104126, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8742908239364624, + "num_tokens": 183977168.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5026655197143555, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8834414482116699, + "num_tokens": 184014371.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.677118182182312, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8565322160720825, + "num_tokens": 184049626.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6652557849884033, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8644934892654419, + "num_tokens": 184082308.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.477121114730835, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8720147013664246, + "num_tokens": 184124580.0, + "step": 4828 + }, + { + "epoch": 0.6142984353135733, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5499305725097656, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8574161529541016, + "num_tokens": 184165069.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5144901275634766, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8734402656555176, + "num_tokens": 184200763.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6257253885269165, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8532906770706177, + "num_tokens": 184238628.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6240825653076172, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8790298104286194, + "num_tokens": 184268238.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5144282579421997, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8628820180892944, + "num_tokens": 184308805.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5100456476211548, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8653027415275574, + "num_tokens": 184350501.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5019476413726807, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8591852784156799, + "num_tokens": 184395462.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5014487504959106, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8705987930297852, + "num_tokens": 184434895.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.528579831123352, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8616670370101929, + "num_tokens": 184474190.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6167429685592651, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8637269735336304, + "num_tokens": 184508779.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4435477256774902, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8658714294433594, + "num_tokens": 184553023.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6808708906173706, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8630346059799194, + "num_tokens": 184586732.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5247186422348022, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8688765168190002, + "num_tokens": 184632020.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6729735136032104, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8496941328048706, + "num_tokens": 184670062.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6720645427703857, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8749315142631531, + "num_tokens": 184706850.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6047779321670532, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8727988004684448, + "num_tokens": 184741739.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5309385061264038, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8696134686470032, + "num_tokens": 184780417.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.564928650856018, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8730907440185547, + "num_tokens": 184817131.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5691384077072144, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8775416016578674, + "num_tokens": 184855544.0, + "step": 4847 + }, + { + "epoch": 0.616715430606793, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6270076036453247, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8635425567626953, + "num_tokens": 184893854.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6407370567321777, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8618146181106567, + "num_tokens": 184927570.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5909632444381714, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8740026950836182, + "num_tokens": 184965097.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8633102178573608, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8731842041015625, + "num_tokens": 184998075.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7460196018218994, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.843264639377594, + "num_tokens": 185035820.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6700700521469116, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8642133474349976, + "num_tokens": 185076814.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4670615196228027, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8747272491455078, + "num_tokens": 185120610.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5416046380996704, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.863795280456543, + "num_tokens": 185159682.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.528311014175415, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.862623929977417, + "num_tokens": 185198450.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7084206342697144, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8707482814788818, + "num_tokens": 185236844.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.634108066558838, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8610296249389648, + "num_tokens": 185271786.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6273322105407715, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871793806552887, + "num_tokens": 185310497.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6109552383422852, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8698322772979736, + "num_tokens": 185345147.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6833627223968506, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8621343970298767, + "num_tokens": 185382730.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6497714519500732, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8530418872833252, + "num_tokens": 185422007.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6206386089324951, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8729573488235474, + "num_tokens": 185454472.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4401533603668213, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8792418837547302, + "num_tokens": 185497761.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7108513116836548, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8676446676254272, + "num_tokens": 185530623.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7621465921401978, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8639302253723145, + "num_tokens": 185563246.0, + "step": 4866 + }, + { + "epoch": 0.6191324259000127, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.519072413444519, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8752192854881287, + "num_tokens": 185604885.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.704860806465149, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8595036864280701, + "num_tokens": 185639225.0, + "step": 4868 + }, + { + "epoch": 0.6193868464571938, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6057217121124268, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8599891066551208, + "num_tokens": 185676231.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5572361946105957, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8662405014038086, + "num_tokens": 185714772.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.720332145690918, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8777463436126709, + "num_tokens": 185748451.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.565315842628479, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8668645024299622, + "num_tokens": 185785014.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.587669849395752, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8727405071258545, + "num_tokens": 185822426.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5228216648101807, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8771951198577881, + "num_tokens": 185860534.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.609246850013733, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8651776909828186, + "num_tokens": 185899635.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5901812314987183, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8655023574829102, + "num_tokens": 185939480.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.589768648147583, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8588532209396362, + "num_tokens": 185976002.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5133178234100342, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8586617708206177, + "num_tokens": 186017296.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4817091226577759, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8612102270126343, + "num_tokens": 186064645.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6741368770599365, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8658849596977234, + "num_tokens": 186097899.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5953059196472168, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8694790601730347, + "num_tokens": 186132510.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 4.100467681884766, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8519506454467773, + "num_tokens": 186169042.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.621826410293579, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8580796718597412, + "num_tokens": 186206054.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7779192924499512, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.852278470993042, + "num_tokens": 186236909.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4544334411621094, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8737587928771973, + "num_tokens": 186280859.0, + "step": 4885 + }, + { + "epoch": 0.6215494211932324, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.57987642288208, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8574646711349487, + "num_tokens": 186321639.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6073726415634155, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8679944276809692, + "num_tokens": 186359160.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6253410577774048, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8647818565368652, + "num_tokens": 186394175.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5341098308563232, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8518354296684265, + "num_tokens": 186437640.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.617286205291748, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.856769323348999, + "num_tokens": 186473730.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.592146396636963, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8669618368148804, + "num_tokens": 186508708.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5579640865325928, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8624341487884521, + "num_tokens": 186550623.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5920767784118652, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8709837198257446, + "num_tokens": 186586915.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6221997737884521, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8453133702278137, + "num_tokens": 186624447.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6112871170043945, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8801352381706238, + "num_tokens": 186660380.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.715512990951538, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8450338840484619, + "num_tokens": 186699353.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5994609594345093, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8694137334823608, + "num_tokens": 186736742.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7142367362976074, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8598539233207703, + "num_tokens": 186769620.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.73222017288208, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8608009815216064, + "num_tokens": 186803175.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6597912311553955, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8486860990524292, + "num_tokens": 186842087.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4256850481033325, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8747490644454956, + "num_tokens": 186883669.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6255073547363281, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8676149845123291, + "num_tokens": 186920182.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.579207420349121, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8515607118606567, + "num_tokens": 186960240.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.563073754310608, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8811774849891663, + "num_tokens": 187000037.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7203813791275024, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8451352715492249, + "num_tokens": 187031813.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5986860990524292, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8786088228225708, + "num_tokens": 187067503.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4419169425964355, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8639824390411377, + "num_tokens": 187109879.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.643850326538086, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8758958578109741, + "num_tokens": 187141963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.536905288696289, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8663392066955566, + "num_tokens": 187183813.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5917973518371582, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8685062527656555, + "num_tokens": 187220831.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5727009773254395, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8522888422012329, + "num_tokens": 187259738.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5476399660110474, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8514816761016846, + "num_tokens": 187303578.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5810914039611816, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.866139829158783, + "num_tokens": 187344021.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8220915794372559, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8677204847335815, + "num_tokens": 187375101.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6164937019348145, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8536626696586609, + "num_tokens": 187412325.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5931793451309204, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8634403347969055, + "num_tokens": 187451886.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6319310665130615, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8664711713790894, + "num_tokens": 187488840.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4749321937561035, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8759733438491821, + "num_tokens": 187530253.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7269257307052612, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8668928146362305, + "num_tokens": 187562143.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.642684817314148, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8635435104370117, + "num_tokens": 187599768.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5335038900375366, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8702957630157471, + "num_tokens": 187637485.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5187045335769653, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8703683614730835, + "num_tokens": 187675048.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4348951578140259, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8661498427391052, + "num_tokens": 187720096.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6084800958633423, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8606389164924622, + "num_tokens": 187761354.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6373873949050903, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8612536191940308, + "num_tokens": 187796318.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5980792045593262, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.866513729095459, + "num_tokens": 187831985.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8615949153900146, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8515673279762268, + "num_tokens": 187866586.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5834596157073975, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8544139862060547, + "num_tokens": 187912023.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.655050277709961, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8534689545631409, + "num_tokens": 187947766.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.768117070198059, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8534462451934814, + "num_tokens": 187982376.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6312613487243652, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8612838983535767, + "num_tokens": 188021139.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.599333643913269, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8722365498542786, + "num_tokens": 188056693.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6141761541366577, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8697726726531982, + "num_tokens": 188096116.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6264389753341675, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8709651231765747, + "num_tokens": 188133785.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.617173433303833, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8507831692695618, + "num_tokens": 188171171.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 3.685318946838379, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8605527877807617, + "num_tokens": 188207716.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5338919162750244, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8499429225921631, + "num_tokens": 188248709.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7214845418930054, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.86626136302948, + "num_tokens": 188283656.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.582398533821106, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8654218912124634, + "num_tokens": 188324566.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5098391771316528, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8677355647087097, + "num_tokens": 188366781.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.581430196762085, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.872268557548523, + "num_tokens": 188407212.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5819624662399292, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8674106001853943, + "num_tokens": 188447645.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6327351331710815, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.868889570236206, + "num_tokens": 188485438.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5105335712432861, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8597303032875061, + "num_tokens": 188528885.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7441060543060303, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.858593761920929, + "num_tokens": 188565386.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4909645318984985, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8786321878433228, + "num_tokens": 188606371.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5140851736068726, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8712730407714844, + "num_tokens": 188644395.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.763637900352478, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8740121722221375, + "num_tokens": 188674256.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.719760775566101, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8653151988983154, + "num_tokens": 188711205.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6887747049331665, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8709532022476196, + "num_tokens": 188748270.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7402292490005493, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8617108464241028, + "num_tokens": 188781888.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5778920650482178, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8685402870178223, + "num_tokens": 188819708.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5465816259384155, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8541142344474792, + "num_tokens": 188860622.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.585830807685852, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8636479377746582, + "num_tokens": 188899321.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6266255378723145, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.862720251083374, + "num_tokens": 188937545.0, + "step": 4955 + }, + { + "epoch": 0.6304541406945681, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.639716386795044, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8715029954910278, + "num_tokens": 188973658.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5124092102050781, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8489408493041992, + "num_tokens": 189017478.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5461012125015259, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8541699051856995, + "num_tokens": 189060604.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7305020093917847, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.861037015914917, + "num_tokens": 189095615.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5147587060928345, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8674482107162476, + "num_tokens": 189137894.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4335927963256836, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.866751492023468, + "num_tokens": 189184685.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5667837858200073, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8734697103500366, + "num_tokens": 189225350.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6228928565979004, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8621053695678711, + "num_tokens": 189262391.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6505489349365234, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8526089191436768, + "num_tokens": 189299936.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8203914165496826, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8377385139465332, + "num_tokens": 189336887.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7715013027191162, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.863294780254364, + "num_tokens": 189372950.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.613404631614685, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8570556640625, + "num_tokens": 189412163.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8178898096084595, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8423804044723511, + "num_tokens": 189448514.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4570764303207397, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.870330810546875, + "num_tokens": 189492224.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.58107328414917, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8516771197319031, + "num_tokens": 189532981.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6322864294052124, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8665594458580017, + "num_tokens": 189568486.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7232859134674072, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.857772707939148, + "num_tokens": 189603860.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5284148454666138, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8350822329521179, + "num_tokens": 189646468.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6175113916397095, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8608621954917908, + "num_tokens": 189684092.0, + "step": 4974 + }, + { + "epoch": 0.6328711359877878, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6801869869232178, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8560826778411865, + "num_tokens": 189718276.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7020604610443115, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8710357546806335, + "num_tokens": 189757774.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.576062560081482, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8660100698471069, + "num_tokens": 189796407.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6567959785461426, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.872075080871582, + "num_tokens": 189834500.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5533719062805176, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8688735961914062, + "num_tokens": 189872487.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6602911949157715, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8640552163124084, + "num_tokens": 189907150.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6311215162277222, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8836559057235718, + "num_tokens": 189942348.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.600915789604187, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8679205775260925, + "num_tokens": 189980049.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6556130647659302, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8728556632995605, + "num_tokens": 190016773.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5368552207946777, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8712345361709595, + "num_tokens": 190053900.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.3797537088394165, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8758257627487183, + "num_tokens": 190098446.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8157674074172974, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.864716649055481, + "num_tokens": 190130413.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7424417734146118, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8565613031387329, + "num_tokens": 190165065.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.53590989112854, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.872711181640625, + "num_tokens": 190203382.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5186480283737183, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8651285171508789, + "num_tokens": 190243365.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5631507635116577, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8568505644798279, + "num_tokens": 190282155.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.520810604095459, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8739831447601318, + "num_tokens": 190318468.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.538408875465393, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8789626359939575, + "num_tokens": 190357892.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6141343116760254, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8609892129898071, + "num_tokens": 190394875.0, + "step": 4993 + }, + { + "epoch": 0.6352881312810075, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.649654746055603, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8649506568908691, + "num_tokens": 190428606.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7789162397384644, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.852607011795044, + "num_tokens": 190466314.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6877248287200928, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8761050701141357, + "num_tokens": 190498201.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7247540950775146, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8445766568183899, + "num_tokens": 190535913.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.730059266090393, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8566848039627075, + "num_tokens": 190571772.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.524151086807251, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8764685392379761, + "num_tokens": 190612161.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6090694665908813, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.850588321685791, + "num_tokens": 190652216.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5373313426971436, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8687640428543091, + "num_tokens": 190689466.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6477679014205933, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8649308681488037, + "num_tokens": 190727216.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7269340753555298, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8584015369415283, + "num_tokens": 190767534.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.668697476387024, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8488677144050598, + "num_tokens": 190807046.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5636907815933228, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8627312183380127, + "num_tokens": 190846849.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5699230432510376, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8671602010726929, + "num_tokens": 190889219.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6564894914627075, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8640086650848389, + "num_tokens": 190931497.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8218406438827515, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8574938774108887, + "num_tokens": 190963178.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4930521249771118, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8573248386383057, + "num_tokens": 191005639.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7159028053283691, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.857337474822998, + "num_tokens": 191039935.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.596980333328247, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8552070260047913, + "num_tokens": 191077855.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7690582275390625, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8593889474868774, + "num_tokens": 191111631.0, + "step": 5012 + }, + { + "epoch": 0.6377051265742272, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7852983474731445, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8789416551589966, + "num_tokens": 191142958.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6089993715286255, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.863301157951355, + "num_tokens": 191180305.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.482459545135498, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.862896203994751, + "num_tokens": 191220260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6654695272445679, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8564144372940063, + "num_tokens": 191259226.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5035184621810913, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8625730276107788, + "num_tokens": 191299338.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5714313983917236, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8571582436561584, + "num_tokens": 191334696.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5055276155471802, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8648293614387512, + "num_tokens": 191376215.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5893232822418213, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.874698281288147, + "num_tokens": 191411792.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5003330707550049, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8700623512268066, + "num_tokens": 191449917.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5907648801803589, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8530645966529846, + "num_tokens": 191488437.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.624008059501648, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8655951023101807, + "num_tokens": 191524260.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5549921989440918, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8605344295501709, + "num_tokens": 191564356.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5332999229431152, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8605469465255737, + "num_tokens": 191603607.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.467786192893982, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8529090285301208, + "num_tokens": 191649584.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.520192265510559, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8617925643920898, + "num_tokens": 191689702.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5162168741226196, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8654246926307678, + "num_tokens": 191730536.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5123379230499268, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.883612871170044, + "num_tokens": 191767846.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5384165048599243, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8505735397338867, + "num_tokens": 191810929.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.762706995010376, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8427611589431763, + "num_tokens": 191845052.0, + "step": 5031 + }, + { + "epoch": 0.6401221218674469, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5733160972595215, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8615070581436157, + "num_tokens": 191884522.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6973954439163208, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.871654748916626, + "num_tokens": 191917881.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7021055221557617, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8647637963294983, + "num_tokens": 191953808.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.618277668952942, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8504998087882996, + "num_tokens": 191995357.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5988132953643799, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8522598147392273, + "num_tokens": 192034665.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7363834381103516, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8585915565490723, + "num_tokens": 192068313.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5090948343276978, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8655340075492859, + "num_tokens": 192109089.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5123844146728516, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8582924604415894, + "num_tokens": 192149011.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6328632831573486, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8632528781890869, + "num_tokens": 192184717.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6494284868240356, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8622603416442871, + "num_tokens": 192219760.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4652411937713623, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8754981756210327, + "num_tokens": 192260966.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5175001621246338, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8547859787940979, + "num_tokens": 192301899.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6819490194320679, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8687236905097961, + "num_tokens": 192333344.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.770128846168518, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8648968935012817, + "num_tokens": 192365669.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5197430849075317, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8665542602539062, + "num_tokens": 192407198.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5933245420455933, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8486159443855286, + "num_tokens": 192444883.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5200358629226685, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8715947866439819, + "num_tokens": 192479785.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7674614191055298, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8827475309371948, + "num_tokens": 192510534.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6707160472869873, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8711231350898743, + "num_tokens": 192548996.0, + "step": 5050 + }, + { + "epoch": 0.6425391171606666, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6411314010620117, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8709836006164551, + "num_tokens": 192583909.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5811518430709839, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8576607704162598, + "num_tokens": 192627979.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5665494203567505, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8652785420417786, + "num_tokens": 192664062.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5161480903625488, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8665508031845093, + "num_tokens": 192704362.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2248756885528564, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8660406470298767, + "num_tokens": 192743237.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5531688928604126, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8587402105331421, + "num_tokens": 192782106.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.9035292863845825, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8523079752922058, + "num_tokens": 192809787.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.768453598022461, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8652913570404053, + "num_tokens": 192842304.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6219007968902588, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8566896915435791, + "num_tokens": 192881004.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5799776315689087, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8748815059661865, + "num_tokens": 192914733.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6220483779907227, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8630268573760986, + "num_tokens": 192954934.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5383621454238892, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8519399166107178, + "num_tokens": 192998790.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.649903655052185, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8596726059913635, + "num_tokens": 193032268.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.3613486289978027, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8809348344802856, + "num_tokens": 193075604.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5232069492340088, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8736010789871216, + "num_tokens": 193111482.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4429216384887695, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8887258768081665, + "num_tokens": 193150731.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5225993394851685, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8732836842536926, + "num_tokens": 193188293.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6499911546707153, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8659460544586182, + "num_tokens": 193224963.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.691698431968689, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8481353521347046, + "num_tokens": 193264437.0, + "step": 5069 + }, + { + "epoch": 0.6449561124538863, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6290743350982666, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8607050180435181, + "num_tokens": 193303244.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4961488246917725, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8699538111686707, + "num_tokens": 193345083.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.1927335262298584, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.851094663143158, + "num_tokens": 193392144.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6932556629180908, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8695147633552551, + "num_tokens": 193425972.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5948779582977295, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8616757392883301, + "num_tokens": 193461439.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5247009992599487, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.866410493850708, + "num_tokens": 193501833.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6653327941894531, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8651928901672363, + "num_tokens": 193538321.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7899080514907837, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8404117226600647, + "num_tokens": 193573260.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6429520845413208, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8685531616210938, + "num_tokens": 193607876.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5627954006195068, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8609740734100342, + "num_tokens": 193651237.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.550820231437683, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8630207777023315, + "num_tokens": 193693779.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4077672958374023, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8714745044708252, + "num_tokens": 193740431.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5373892784118652, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8698517084121704, + "num_tokens": 193779980.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5926365852355957, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8540917634963989, + "num_tokens": 193816898.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5126237869262695, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8787370920181274, + "num_tokens": 193857775.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.555922508239746, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8643389940261841, + "num_tokens": 193897299.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4587960243225098, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8700175285339355, + "num_tokens": 193941495.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5707858800888062, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8682427406311035, + "num_tokens": 193978132.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7421859502792358, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8635283708572388, + "num_tokens": 194008940.0, + "step": 5088 + }, + { + "epoch": 0.647373107747106, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.57566499710083, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8550335168838501, + "num_tokens": 194048924.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.669335126876831, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8632424473762512, + "num_tokens": 194080082.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6054714918136597, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8565559387207031, + "num_tokens": 194118018.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4545897245407104, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8708240389823914, + "num_tokens": 194158472.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.641618251800537, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.847569465637207, + "num_tokens": 194194338.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5911338329315186, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8683621287345886, + "num_tokens": 194229929.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6845802068710327, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.862257719039917, + "num_tokens": 194264031.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.492885947227478, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8782515525817871, + "num_tokens": 194300987.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.562990665435791, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8605673313140869, + "num_tokens": 194340635.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6735122203826904, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8697395324707031, + "num_tokens": 194376855.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5533876419067383, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8595647811889648, + "num_tokens": 194421639.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.466341257095337, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8583211302757263, + "num_tokens": 194467142.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7855098247528076, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8520033359527588, + "num_tokens": 194497273.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5888530015945435, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8652356266975403, + "num_tokens": 194533278.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.488499641418457, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8678204417228699, + "num_tokens": 194580600.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.733979344367981, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8462144732475281, + "num_tokens": 194620386.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5700753927230835, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8646551966667175, + "num_tokens": 194655595.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4893494844436646, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8696068525314331, + "num_tokens": 194697260.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7396385669708252, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8530302047729492, + "num_tokens": 194730339.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4400001764297485, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8782566785812378, + "num_tokens": 194768304.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5157265663146973, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8784002065658569, + "num_tokens": 194806185.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5438542366027832, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8568564057350159, + "num_tokens": 194844061.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 2.2716116905212402, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8525983095169067, + "num_tokens": 194877467.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.595883846282959, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8581396341323853, + "num_tokens": 194914130.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4840612411499023, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8623273372650146, + "num_tokens": 194956447.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.651003360748291, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8555889129638672, + "num_tokens": 194992375.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5307207107543945, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.855218768119812, + "num_tokens": 195031856.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7423001527786255, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8625412583351135, + "num_tokens": 195070661.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7363842725753784, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8585858941078186, + "num_tokens": 195103564.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.686156153678894, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.854516863822937, + "num_tokens": 195140484.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5143427848815918, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8759044408798218, + "num_tokens": 195178723.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7153444290161133, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8645804524421692, + "num_tokens": 195210628.0, + "step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5539255142211914, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8678548336029053, + "num_tokens": 195252062.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.578906536102295, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8581573367118835, + "num_tokens": 195290740.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6403005123138428, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8653161525726318, + "num_tokens": 195327564.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4314439296722412, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.872115969657898, + "num_tokens": 195370927.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5305687189102173, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8540215492248535, + "num_tokens": 195413732.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6218361854553223, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8628618717193604, + "num_tokens": 195450185.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6805611848831177, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8674386739730835, + "num_tokens": 195488394.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.498265027999878, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8663655519485474, + "num_tokens": 195527810.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6011933088302612, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8524711728096008, + "num_tokens": 195566669.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.575895071029663, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8594675064086914, + "num_tokens": 195603805.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.623426914215088, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.871295690536499, + "num_tokens": 195638257.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6541543006896973, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8543932437896729, + "num_tokens": 195675782.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.536055564880371, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8735090494155884, + "num_tokens": 195712069.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.63048255443573, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8503700494766235, + "num_tokens": 195748449.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.666452407836914, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8642534613609314, + "num_tokens": 195782518.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6043753623962402, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8673127889633179, + "num_tokens": 195820843.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6761047840118408, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8433259129524231, + "num_tokens": 195860215.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.563415765762329, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8797240257263184, + "num_tokens": 195903437.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6769812107086182, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8551428318023682, + "num_tokens": 195940787.0, + "step": 5139 + }, + { + "epoch": 0.6538608319552219, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7348841428756714, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8542031645774841, + "num_tokens": 195976076.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4728530645370483, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8668668866157532, + "num_tokens": 196019260.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6093977689743042, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8613633513450623, + "num_tokens": 196056005.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6936309337615967, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.853836715221405, + "num_tokens": 196092345.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.648389220237732, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8568044304847717, + "num_tokens": 196130340.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6443322896957397, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8539847135543823, + "num_tokens": 196165663.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5319960117340088, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8698364496231079, + "num_tokens": 196206407.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5686545372009277, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8573727607727051, + "num_tokens": 196246567.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6766091585159302, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8488343358039856, + "num_tokens": 196282017.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6465907096862793, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8497350215911865, + "num_tokens": 196319846.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7884767055511475, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8564566969871521, + "num_tokens": 196361256.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5620379447937012, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8746099472045898, + "num_tokens": 196399412.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7589318752288818, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8655679821968079, + "num_tokens": 196435930.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7154902219772339, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8599481582641602, + "num_tokens": 196471401.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.401712417602539, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8746548891067505, + "num_tokens": 196514589.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.9042640924453735, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8666154146194458, + "num_tokens": 196545790.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.718627691268921, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8602530360221863, + "num_tokens": 196580566.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6413666009902954, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8624812364578247, + "num_tokens": 196622604.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7832039594650269, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8788498640060425, + "num_tokens": 196651678.0, + "step": 5158 + }, + { + "epoch": 0.6562778272484416, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8051323890686035, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8452597856521606, + "num_tokens": 196684977.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8034006357192993, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8713147640228271, + "num_tokens": 196723772.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.8116189241409302, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8570730686187744, + "num_tokens": 196754256.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6859042644500732, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8604244589805603, + "num_tokens": 196791744.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6094335317611694, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8591805100440979, + "num_tokens": 196827771.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7372502088546753, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8790689706802368, + "num_tokens": 196858215.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6530412435531616, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8676815032958984, + "num_tokens": 196893947.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7443262338638306, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8641471862792969, + "num_tokens": 196923359.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.502621054649353, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8627227544784546, + "num_tokens": 196968585.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.518603801727295, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8808119297027588, + "num_tokens": 197005672.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6433351039886475, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8555363416671753, + "num_tokens": 197042264.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5247547626495361, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8619734048843384, + "num_tokens": 197085723.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5316826105117798, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8612651824951172, + "num_tokens": 197127319.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7054030895233154, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8651418685913086, + "num_tokens": 197162426.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.508836030960083, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8777626752853394, + "num_tokens": 197202875.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.7173936367034912, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8347626328468323, + "num_tokens": 197240967.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.571403980255127, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8588196635246277, + "num_tokens": 197281351.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4925689697265625, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8634718656539917, + "num_tokens": 197319377.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.5776909589767456, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8787940144538879, + "num_tokens": 197355860.0, + "step": 5177 + }, + { + "epoch": 0.6586948225416613, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.5199311971664429, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8688989877700806, + "num_tokens": 197393053.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.5199164152145386, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8548935651779175, + "num_tokens": 197432244.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.678505778312683, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8560934662818909, + "num_tokens": 197466657.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.805435061454773, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8581775426864624, + "num_tokens": 197501443.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5528485774993896, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8650767803192139, + "num_tokens": 197541589.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6059226989746094, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8557413220405579, + "num_tokens": 197582790.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.5388823747634888, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8653824329376221, + "num_tokens": 197623270.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.6462244987487793, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.855793297290802, + "num_tokens": 197661865.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.699446678161621, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8373419046401978, + "num_tokens": 197702802.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.4472991228103638, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8729783892631531, + "num_tokens": 197745809.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 1.5497207641601562e-05, + "grad_norm": 1.71372652053833, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8753212690353394, + "num_tokens": 197778862.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.765405297279358, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8573266267776489, + "num_tokens": 197813848.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.4936234951019287, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8679723739624023, + "num_tokens": 197856815.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7110681533813477, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.855279266834259, + "num_tokens": 197888826.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5734806060791016, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8741509914398193, + "num_tokens": 197927034.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5686408281326294, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.858501672744751, + "num_tokens": 197966494.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.8362089395523071, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8655453324317932, + "num_tokens": 198000027.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5813573598861694, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8616036176681519, + "num_tokens": 198037822.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5926421880722046, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8526135683059692, + "num_tokens": 198078370.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6802436113357544, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8495371341705322, + "num_tokens": 198119150.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6050186157226562, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8595173358917236, + "num_tokens": 198155951.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5576504468917847, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8624321222305298, + "num_tokens": 198196722.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7535086870193481, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8401468396186829, + "num_tokens": 198234264.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.61615788936615, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8597877025604248, + "num_tokens": 198266747.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5512285232543945, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8653582334518433, + "num_tokens": 198304205.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5127835273742676, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8566200733184814, + "num_tokens": 198349603.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5197104215621948, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8597214221954346, + "num_tokens": 198390905.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6420937776565552, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8519936203956604, + "num_tokens": 198432157.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6393632888793945, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.849882960319519, + "num_tokens": 198469669.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6889539957046509, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8640258312225342, + "num_tokens": 198502302.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7094264030456543, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8625204563140869, + "num_tokens": 198544375.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.700069785118103, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8507380485534668, + "num_tokens": 198584142.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5243186950683594, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8628300428390503, + "num_tokens": 198625745.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6437398195266724, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8856638669967651, + "num_tokens": 198662058.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5131443738937378, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8767566680908203, + "num_tokens": 198703829.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5654268264770508, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8635189533233643, + "num_tokens": 198747748.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7066195011138916, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8549960851669312, + "num_tokens": 198788800.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.61068594455719, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8696749806404114, + "num_tokens": 198825088.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5018771886825562, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8692281246185303, + "num_tokens": 198866764.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6690164804458618, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8636900186538696, + "num_tokens": 198904120.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6724501848220825, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8622263073921204, + "num_tokens": 198941426.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7391886711120605, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8674114346504211, + "num_tokens": 198975997.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.538425326347351, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8643218278884888, + "num_tokens": 199016400.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.682182788848877, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8513542413711548, + "num_tokens": 199056499.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5314315557479858, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.868328332901001, + "num_tokens": 199096475.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.585728645324707, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8447932004928589, + "num_tokens": 199139476.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5872442722320557, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8534467220306396, + "num_tokens": 199177108.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.4460079669952393, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8736861348152161, + "num_tokens": 199219538.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.630270004272461, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8709172010421753, + "num_tokens": 199254649.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.5179764032363892, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.863611102104187, + "num_tokens": 199293206.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.5659856796264648, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8734626770019531, + "num_tokens": 199328430.0, + "step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.583115577697754, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8695696592330933, + "num_tokens": 199364051.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 1.5616416931152344e-05, + "grad_norm": 1.7845683097839355, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8676254749298096, + "num_tokens": 199397389.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7087242603302002, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8671884536743164, + "num_tokens": 199432867.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6345101594924927, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8606742024421692, + "num_tokens": 199470389.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6391394138336182, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.878369152545929, + "num_tokens": 199505843.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5621076822280884, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8742527961730957, + "num_tokens": 199538672.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.4591890573501587, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8716257810592651, + "num_tokens": 199581483.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.603377342224121, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8534979820251465, + "num_tokens": 199627542.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6502870321273804, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8684420585632324, + "num_tokens": 199661933.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6329790353775024, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8474932909011841, + "num_tokens": 199700872.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5555466413497925, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8656006455421448, + "num_tokens": 199738949.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7108564376831055, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.843035101890564, + "num_tokens": 199776556.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6447031497955322, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8407022356987, + "num_tokens": 199817178.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.61970055103302, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8627233505249023, + "num_tokens": 199852874.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.666509747505188, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8503193855285645, + "num_tokens": 199888738.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6376228332519531, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8663488626480103, + "num_tokens": 199930193.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6991682052612305, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8653318285942078, + "num_tokens": 199962504.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.3827286958694458, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.882962703704834, + "num_tokens": 200004669.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6248620748519897, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8717271685600281, + "num_tokens": 200038527.0, + "step": 5247 + }, + { + "epoch": 0.6675995420429971, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.563849925994873, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8608900308609009, + "num_tokens": 200073746.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 2.1681771278381348, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8781725168228149, + "num_tokens": 200109582.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6195939779281616, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8682740330696106, + "num_tokens": 200145376.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5485962629318237, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8538157343864441, + "num_tokens": 200183903.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.587430477142334, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8601506352424622, + "num_tokens": 200223713.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6305896043777466, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8469414710998535, + "num_tokens": 200265187.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6031780242919922, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8621156811714172, + "num_tokens": 200301150.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6433742046356201, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8629864454269409, + "num_tokens": 200336147.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5906480550765991, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8638374209403992, + "num_tokens": 200371104.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.4789605140686035, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8632683753967285, + "num_tokens": 200414974.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5869807004928589, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.869093656539917, + "num_tokens": 200457167.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7067384719848633, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.870718777179718, + "num_tokens": 200488847.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.683754801750183, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8585957884788513, + "num_tokens": 200522254.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5222762823104858, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8763803839683533, + "num_tokens": 200559957.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.7118635177612305, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.838708758354187, + "num_tokens": 200598027.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5482639074325562, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8764067888259888, + "num_tokens": 200639280.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.5545886754989624, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8692252039909363, + "num_tokens": 200679988.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.62241530418396, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8563045263290405, + "num_tokens": 200716483.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.641539454460144, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8468157052993774, + "num_tokens": 200756609.0, + "step": 5266 + }, + { + "epoch": 0.6700165373362168, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6610867977142334, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8606646656990051, + "num_tokens": 200795175.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 1.5735626220703125e-05, + "grad_norm": 1.6027672290802002, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8474763631820679, + "num_tokens": 200837109.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.5249496698379517, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8637471199035645, + "num_tokens": 200874285.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6184197664260864, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8545049428939819, + "num_tokens": 200911572.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.474061131477356, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8683425188064575, + "num_tokens": 200953913.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.4845538139343262, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8577101230621338, + "num_tokens": 200997004.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.5281928777694702, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8676042556762695, + "num_tokens": 201036786.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.5484955310821533, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8672665953636169, + "num_tokens": 201072440.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6214581727981567, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8596731424331665, + "num_tokens": 201110460.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6216092109680176, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8553365468978882, + "num_tokens": 201147849.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6210107803344727, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8659201264381409, + "num_tokens": 201183524.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6739370822906494, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8567754030227661, + "num_tokens": 201224311.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6674119234085083, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8515045642852783, + "num_tokens": 201259641.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.5885218381881714, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8439725637435913, + "num_tokens": 201302187.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6000525951385498, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8762074708938599, + "num_tokens": 201339857.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.6087424755096436, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8539003133773804, + "num_tokens": 201380694.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6354752779006958, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8595439791679382, + "num_tokens": 201419128.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.670060396194458, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8548797965049744, + "num_tokens": 201456723.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5279037952423096, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8684607744216919, + "num_tokens": 201497285.0, + "step": 5285 + }, + { + "epoch": 0.6724335326294365, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5011472702026367, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8659370541572571, + "num_tokens": 201539158.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.4667128324508667, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8753426671028137, + "num_tokens": 201580250.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6636080741882324, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8657265305519104, + "num_tokens": 201621060.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.597415804862976, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8696392178535461, + "num_tokens": 201659229.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5778076648712158, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8560876846313477, + "num_tokens": 201702507.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6732937097549438, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8602334260940552, + "num_tokens": 201738098.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7416965961456299, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8625816702842712, + "num_tokens": 201776864.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.546566128730774, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8554944396018982, + "num_tokens": 201819891.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.489431381225586, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8747611045837402, + "num_tokens": 201858981.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5974940061569214, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8582357168197632, + "num_tokens": 201897305.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 1.5974044799804688e-05, + "grad_norm": 1.602244257926941, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8539732694625854, + "num_tokens": 201938904.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.856552004814148, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.863417387008667, + "num_tokens": 201969527.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7368518114089966, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8597112894058228, + "num_tokens": 202003820.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5740739107131958, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8485286235809326, + "num_tokens": 202049151.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5765248537063599, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8509551286697388, + "num_tokens": 202093848.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7042624950408936, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8575306534767151, + "num_tokens": 202129691.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5205856561660767, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8650486469268799, + "num_tokens": 202175431.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5908492803573608, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8606940507888794, + "num_tokens": 202215308.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7313467264175415, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8625792264938354, + "num_tokens": 202250235.0, + "step": 5304 + }, + { + "epoch": 0.6748505279226561, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.546230435371399, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8433685302734375, + "num_tokens": 202288535.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6996791362762451, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.857543408870697, + "num_tokens": 202324329.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5465253591537476, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.86346834897995, + "num_tokens": 202361934.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5326062440872192, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8623853325843811, + "num_tokens": 202404400.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6041433811187744, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8515045642852783, + "num_tokens": 202445503.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.4965299367904663, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8693327307701111, + "num_tokens": 202485204.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6917452812194824, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8611983060836792, + "num_tokens": 202529320.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5931932926177979, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8611741065979004, + "num_tokens": 202569917.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6089240312576294, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8788667321205139, + "num_tokens": 202605222.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7187788486480713, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8506582975387573, + "num_tokens": 202647122.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6651806831359863, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8543450832366943, + "num_tokens": 202683538.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6036285161972046, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8696811199188232, + "num_tokens": 202718490.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.683408498764038, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8443173170089722, + "num_tokens": 202759446.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5897951126098633, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8664116859436035, + "num_tokens": 202799758.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5304679870605469, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8531962037086487, + "num_tokens": 202842002.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7291542291641235, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8537803888320923, + "num_tokens": 202874486.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6438153982162476, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8625948429107666, + "num_tokens": 202905299.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6244350671768188, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8795642852783203, + "num_tokens": 202940053.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.4596929550170898, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8798208236694336, + "num_tokens": 202982430.0, + "step": 5323 + }, + { + "epoch": 0.6772675232158758, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5415352582931519, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8627238869667053, + "num_tokens": 203021706.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7467375993728638, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8566785454750061, + "num_tokens": 203060911.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6126909255981445, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8715402483940125, + "num_tokens": 203099604.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5566972494125366, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8559072017669678, + "num_tokens": 203147190.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6566616296768188, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8707684874534607, + "num_tokens": 203181716.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.658219337463379, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8546609282493591, + "num_tokens": 203223773.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6867941617965698, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8561131954193115, + "num_tokens": 203260331.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6566696166992188, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8602307438850403, + "num_tokens": 203300150.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5258440971374512, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8679301142692566, + "num_tokens": 203341697.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.610687255859375, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.849773645401001, + "num_tokens": 203383015.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6456258296966553, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8447911739349365, + "num_tokens": 203422633.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6409512758255005, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8712725639343262, + "num_tokens": 203458744.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.816848635673523, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8539414405822754, + "num_tokens": 203492856.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6931430101394653, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8626372814178467, + "num_tokens": 203525524.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5525619983673096, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8589035868644714, + "num_tokens": 203563980.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5195420980453491, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.869318962097168, + "num_tokens": 203604729.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.520026683807373, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8707774877548218, + "num_tokens": 203645678.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.4881190061569214, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8653526902198792, + "num_tokens": 203689783.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7534548044204712, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8633785247802734, + "num_tokens": 203721609.0, + "step": 5342 + }, + { + "epoch": 0.6796845185090955, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.9089316129684448, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8563358783721924, + "num_tokens": 203750605.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.4877278804779053, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8769761919975281, + "num_tokens": 203795751.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.560479998588562, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8652318716049194, + "num_tokens": 203837722.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6581894159317017, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8710848689079285, + "num_tokens": 203872722.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6352850198745728, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8591530323028564, + "num_tokens": 203910313.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5560675859451294, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8604093194007874, + "num_tokens": 203949253.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6390069723129272, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8585529327392578, + "num_tokens": 203989647.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7487425804138184, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8589199781417847, + "num_tokens": 204027454.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.6413646936416626, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8652533292770386, + "num_tokens": 204068819.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.7205164432525635, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8622472882270813, + "num_tokens": 204101355.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5524400472640991, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8538131713867188, + "num_tokens": 204142413.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.669070839881897, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.856554388999939, + "num_tokens": 204179311.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5799955129623413, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8485840559005737, + "num_tokens": 204219682.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 1.609325408935547e-05, + "grad_norm": 1.5436433553695679, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8663623332977295, + "num_tokens": 204257457.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7623441219329834, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8658069372177124, + "num_tokens": 204290737.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7190437316894531, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8608667254447937, + "num_tokens": 204324206.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5336636304855347, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8628848791122437, + "num_tokens": 204362036.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6841791868209839, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8467520475387573, + "num_tokens": 204399629.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.562972903251648, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8605962991714478, + "num_tokens": 204439292.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6742585897445679, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8824515342712402, + "num_tokens": 204476590.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.644170880317688, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8607760667800903, + "num_tokens": 204516017.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6166044473648071, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8623706698417664, + "num_tokens": 204553753.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5371263027191162, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8541268706321716, + "num_tokens": 204598816.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5248074531555176, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8614487051963806, + "num_tokens": 204636476.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.513797640800476, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8673625588417053, + "num_tokens": 204676532.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5523662567138672, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8530128002166748, + "num_tokens": 204716628.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5144771337509155, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8686070442199707, + "num_tokens": 204755090.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6339102983474731, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8697974681854248, + "num_tokens": 204788745.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7279630899429321, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8688951730728149, + "num_tokens": 204823338.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5194555521011353, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8653741478919983, + "num_tokens": 204863024.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5729706287384033, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.863838791847229, + "num_tokens": 204905414.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5965551137924194, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8530844449996948, + "num_tokens": 204946960.0, + "step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5811378955841064, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8665030002593994, + "num_tokens": 204986615.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7605938911437988, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8563665151596069, + "num_tokens": 205019852.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7156211137771606, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8420208692550659, + "num_tokens": 205060861.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5841176509857178, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8639395236968994, + "num_tokens": 205099046.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5718433856964111, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8545256853103638, + "num_tokens": 205141709.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6206591129302979, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8647477030754089, + "num_tokens": 205179025.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5862499475479126, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8568298816680908, + "num_tokens": 205219524.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.655356764793396, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8695482015609741, + "num_tokens": 205254717.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.544515609741211, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8519927263259888, + "num_tokens": 205301739.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.4727939367294312, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8763916492462158, + "num_tokens": 205339815.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5839605331420898, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8529135584831238, + "num_tokens": 205378594.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5063999891281128, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8563486933708191, + "num_tokens": 205419922.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.4222370386123657, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8851181268692017, + "num_tokens": 205459738.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5798189640045166, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8853493928909302, + "num_tokens": 205497358.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6484941244125366, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8663604855537415, + "num_tokens": 205534079.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6331323385238647, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8640128970146179, + "num_tokens": 205571925.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.4792965650558472, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8580499887466431, + "num_tokens": 205615363.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6380325555801392, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.865411639213562, + "num_tokens": 205652477.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.471686840057373, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8857422471046448, + "num_tokens": 205688276.0, + "step": 5393 + }, + { + "epoch": 0.6861722427172116, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.540315866470337, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8770741820335388, + "num_tokens": 205723432.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6819195747375488, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8760007619857788, + "num_tokens": 205758051.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5742449760437012, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8739449381828308, + "num_tokens": 205795643.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7890703678131104, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8405653238296509, + "num_tokens": 205833087.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.665381908416748, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8478460311889648, + "num_tokens": 205871481.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.728042721748352, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8325337767601013, + "num_tokens": 205908491.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5455682277679443, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8702597618103027, + "num_tokens": 205946386.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5624020099639893, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8516106605529785, + "num_tokens": 205986856.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.558781385421753, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8662248253822327, + "num_tokens": 206025708.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.578568458557129, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.866349458694458, + "num_tokens": 206062588.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5616753101348877, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8684512972831726, + "num_tokens": 206103393.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.4898309707641602, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8709330558776855, + "num_tokens": 206146923.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7354965209960938, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8724716901779175, + "num_tokens": 206179093.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6909825801849365, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8549073934555054, + "num_tokens": 206213198.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6628029346466064, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8468389511108398, + "num_tokens": 206250273.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7103124856948853, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8613148927688599, + "num_tokens": 206286711.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.563157558441162, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.877424955368042, + "num_tokens": 206323572.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6399859189987183, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8744972348213196, + "num_tokens": 206355974.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.7315125465393066, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8497776389122009, + "num_tokens": 206390570.0, + "step": 5412 + }, + { + "epoch": 0.6885892380104313, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4985947608947754, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8828999400138855, + "num_tokens": 206425183.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.592486023902893, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8716607093811035, + "num_tokens": 206461954.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.564644455909729, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8647158145904541, + "num_tokens": 206500917.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.805100679397583, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8401923179626465, + "num_tokens": 206535583.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6314975023269653, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8640632629394531, + "num_tokens": 206572587.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.645117163658142, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8497694730758667, + "num_tokens": 206609661.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.7432583570480347, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8451807498931885, + "num_tokens": 206646812.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4907147884368896, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8736556768417358, + "num_tokens": 206685060.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.8020367622375488, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8646533489227295, + "num_tokens": 206721626.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5475800037384033, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8522380590438843, + "num_tokens": 206767972.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5784187316894531, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8701122403144836, + "num_tokens": 206807150.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4731470346450806, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8664790391921997, + "num_tokens": 206851097.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.7350443601608276, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.872995138168335, + "num_tokens": 206885996.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6510059833526611, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8742875456809998, + "num_tokens": 206920922.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5705591440200806, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8697009086608887, + "num_tokens": 206961666.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.7186598777770996, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.86713707447052, + "num_tokens": 207001979.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6169034242630005, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8672286868095398, + "num_tokens": 207039810.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5907076597213745, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8565689325332642, + "num_tokens": 207075966.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.493764042854309, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8685182332992554, + "num_tokens": 207118423.0, + "step": 5431 + }, + { + "epoch": 0.691006233303651, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4885526895523071, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8803185224533081, + "num_tokens": 207154755.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6407188177108765, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8577293157577515, + "num_tokens": 207193965.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.689374327659607, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8630250692367554, + "num_tokens": 207234527.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6197853088378906, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8509535193443298, + "num_tokens": 207273998.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4684675931930542, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8557896614074707, + "num_tokens": 207318783.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5051442384719849, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8722778558731079, + "num_tokens": 207358403.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.704906940460205, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8641133308410645, + "num_tokens": 207397631.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.553357481956482, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8660925626754761, + "num_tokens": 207441651.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.4540859460830688, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8768413662910461, + "num_tokens": 207483235.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.536373257637024, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8740401268005371, + "num_tokens": 207521871.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5118813514709473, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8702304363250732, + "num_tokens": 207561717.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5750973224639893, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8724690675735474, + "num_tokens": 207603684.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6773098707199097, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8662170171737671, + "num_tokens": 207638250.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4728707075119019, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8832747936248779, + "num_tokens": 207675289.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6517549753189087, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8652928471565247, + "num_tokens": 207710926.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6592291593551636, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8666067123413086, + "num_tokens": 207747965.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.5261924266815186, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8698879480361938, + "num_tokens": 207785059.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6357976198196411, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8537484407424927, + "num_tokens": 207823511.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.640929102897644, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.864809513092041, + "num_tokens": 207860761.0, + "step": 5450 + }, + { + "epoch": 0.6934232285968707, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6775540113449097, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8647466897964478, + "num_tokens": 207895901.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 1.621246337890625e-05, + "grad_norm": 1.6221003532409668, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8503776788711548, + "num_tokens": 207940428.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.49674391746521, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8557928800582886, + "num_tokens": 207983137.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 3.6540017127990723, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8742133378982544, + "num_tokens": 208018425.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.43816339969635, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8587434887886047, + "num_tokens": 208061474.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5127569437026978, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8620048761367798, + "num_tokens": 208097963.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.7816413640975952, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8613230586051941, + "num_tokens": 208130920.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.534863829612732, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8593500852584839, + "num_tokens": 208170071.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.545099139213562, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8559511303901672, + "num_tokens": 208208584.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.9495962858200073, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8632570505142212, + "num_tokens": 208234037.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5454918146133423, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8561325073242188, + "num_tokens": 208273732.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.530595064163208, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8814177513122559, + "num_tokens": 208314661.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5447134971618652, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8508446216583252, + "num_tokens": 208360171.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6848224401474, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8586372137069702, + "num_tokens": 208395665.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6478031873703003, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8441526889801025, + "num_tokens": 208436285.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5720529556274414, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8705742359161377, + "num_tokens": 208471061.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.3824080228805542, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8657974600791931, + "num_tokens": 208515527.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6747512817382812, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8522945642471313, + "num_tokens": 208554171.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.759118914604187, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8536362051963806, + "num_tokens": 208585986.0, + "step": 5469 + }, + { + "epoch": 0.6958402238900904, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.6044068336486816, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8759048581123352, + "num_tokens": 208623066.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.430764079093933, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8731037378311157, + "num_tokens": 208666229.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.545206904411316, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8768376111984253, + "num_tokens": 208705113.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 1.6450881958007812e-05, + "grad_norm": 1.6268200874328613, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8682295083999634, + "num_tokens": 208744141.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.611656665802002, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8725399374961853, + "num_tokens": 208781183.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5494475364685059, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8677262663841248, + "num_tokens": 208820022.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.8285224437713623, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8515797853469849, + "num_tokens": 208852682.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.7098244428634644, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8701831102371216, + "num_tokens": 208883593.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5885355472564697, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.849326491355896, + "num_tokens": 208925991.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.635594367980957, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8539838194847107, + "num_tokens": 208963007.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5759505033493042, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8644723892211914, + "num_tokens": 208999512.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6678489446640015, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8617864847183228, + "num_tokens": 209034205.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5968234539031982, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8634623289108276, + "num_tokens": 209069875.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5739496946334839, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8604358434677124, + "num_tokens": 209109410.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.65444815158844, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8677248358726501, + "num_tokens": 209143219.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.661311149597168, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8705276250839233, + "num_tokens": 209176529.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4999395608901978, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8704422116279602, + "num_tokens": 209217090.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4988347291946411, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.87409907579422, + "num_tokens": 209256511.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5858150720596313, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8524590134620667, + "num_tokens": 209296438.0, + "step": 5488 + }, + { + "epoch": 0.69825721918331, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7235240936279297, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8716047406196594, + "num_tokens": 209331570.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.696262240409851, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8641847372055054, + "num_tokens": 209365295.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5525349378585815, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8809086680412292, + "num_tokens": 209400401.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.57659912109375, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8576248288154602, + "num_tokens": 209437086.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6912577152252197, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8566191792488098, + "num_tokens": 209476186.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.638264536857605, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8653405904769897, + "num_tokens": 209511390.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.4840450286865234, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.868565559387207, + "num_tokens": 209554814.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 1.633167266845703e-05, + "grad_norm": 1.5533379316329956, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8601045608520508, + "num_tokens": 209599144.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 3.7046356201171875, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8345127105712891, + "num_tokens": 209633816.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5597048997879028, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8661026954650879, + "num_tokens": 209670239.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6368523836135864, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8565901517868042, + "num_tokens": 209710613.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6496013402938843, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8650081157684326, + "num_tokens": 209745839.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6098508834838867, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8591295480728149, + "num_tokens": 209781845.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5102975368499756, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8772023916244507, + "num_tokens": 209817792.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6190696954727173, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8762661218643188, + "num_tokens": 209853178.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5889166593551636, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8637535572052002, + "num_tokens": 209891684.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7627196311950684, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8502327799797058, + "num_tokens": 209930724.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.679349422454834, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8691808581352234, + "num_tokens": 209967906.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6859012842178345, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8770540356636047, + "num_tokens": 210002347.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4911400079727173, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8660278916358948, + "num_tokens": 210042188.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6822261810302734, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8560746312141418, + "num_tokens": 210082589.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.8070902824401855, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8631049394607544, + "num_tokens": 210119735.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7114648818969727, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8692213296890259, + "num_tokens": 210155825.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7333965301513672, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8699701428413391, + "num_tokens": 210188608.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5877577066421509, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8564727306365967, + "num_tokens": 210225230.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.503114104270935, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8663793802261353, + "num_tokens": 210265263.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6787253618240356, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8636975288391113, + "num_tokens": 210302929.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5259733200073242, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8843374252319336, + "num_tokens": 210344903.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5539710521697998, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8681702017784119, + "num_tokens": 210387670.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5046693086624146, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8797858953475952, + "num_tokens": 210428809.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6686029434204102, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8430726528167725, + "num_tokens": 210464467.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5923789739608765, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8683314323425293, + "num_tokens": 210502971.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6072872877120972, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8657821416854858, + "num_tokens": 210542593.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5629581212997437, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.866851270198822, + "num_tokens": 210582123.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5357649326324463, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8804201483726501, + "num_tokens": 210617702.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6586353778839111, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8625706434249878, + "num_tokens": 210652723.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6098275184631348, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8787068128585815, + "num_tokens": 210689891.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4963756799697876, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8642827272415161, + "num_tokens": 210735908.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5965840816497803, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8819777965545654, + "num_tokens": 210777278.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.531020998954773, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8748394846916199, + "num_tokens": 210815867.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5971577167510986, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8609018325805664, + "num_tokens": 210856526.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.670194149017334, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8586055636405945, + "num_tokens": 210897077.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6942943334579468, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8490155935287476, + "num_tokens": 210932076.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5768475532531738, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8757789134979248, + "num_tokens": 210968446.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5368949174880981, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8574589490890503, + "num_tokens": 211008821.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.661755084991455, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8612838983535767, + "num_tokens": 211044451.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6732487678527832, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8734676837921143, + "num_tokens": 211078360.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5785421133041382, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8683724403381348, + "num_tokens": 211114297.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.608246088027954, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8673074245452881, + "num_tokens": 211147443.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5529738664627075, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8699734210968018, + "num_tokens": 211187813.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7428048849105835, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8527549505233765, + "num_tokens": 211226187.0, + "step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5486470460891724, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8649507761001587, + "num_tokens": 211263553.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7871146202087402, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8529664278030396, + "num_tokens": 211296768.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6945576667785645, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8693875074386597, + "num_tokens": 211332212.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5605560541152954, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8608669638633728, + "num_tokens": 211373490.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6361795663833618, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8718106746673584, + "num_tokens": 211411045.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6231584548950195, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8759522438049316, + "num_tokens": 211444769.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5334155559539795, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8818647265434265, + "num_tokens": 211482365.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7341010570526123, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8693392276763916, + "num_tokens": 211515877.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5470523834228516, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8569921255111694, + "num_tokens": 211557483.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5984364748001099, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8613260984420776, + "num_tokens": 211594752.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.47132408618927, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8667140007019043, + "num_tokens": 211636654.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6821205615997314, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8615986108779907, + "num_tokens": 211676344.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.611312985420227, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8493196368217468, + "num_tokens": 211712700.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.737044334411621, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8664709329605103, + "num_tokens": 211747296.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4740653038024902, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.864406943321228, + "num_tokens": 211787010.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7072670459747314, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8558298945426941, + "num_tokens": 211821604.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6681747436523438, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8715856075286865, + "num_tokens": 211855311.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6493451595306396, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8681815266609192, + "num_tokens": 211890530.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6010693311691284, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8584088087081909, + "num_tokens": 211928580.0, + "step": 5558 + }, + { + "epoch": 0.7071619386846457, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.535031795501709, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.865447998046875, + "num_tokens": 211966172.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.598684549331665, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8595607280731201, + "num_tokens": 212004915.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5233063697814941, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.864192008972168, + "num_tokens": 212044576.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.576314926147461, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8710286021232605, + "num_tokens": 212081409.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.4528733491897583, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8786126375198364, + "num_tokens": 212119690.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6149481534957886, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.865998387336731, + "num_tokens": 212155289.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7360373735427856, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8605000972747803, + "num_tokens": 212189885.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5681756734848022, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8607527017593384, + "num_tokens": 212225796.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.63046133518219, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8515850305557251, + "num_tokens": 212262840.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.657088279724121, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8645931482315063, + "num_tokens": 212296082.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.693206548690796, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8464670777320862, + "num_tokens": 212331938.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.637975811958313, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8636812567710876, + "num_tokens": 212369237.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.5244626998901367, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8748993277549744, + "num_tokens": 212404228.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5404870510101318, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.873234212398529, + "num_tokens": 212442182.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7353636026382446, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8633636236190796, + "num_tokens": 212473500.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.7970894575119019, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8419922590255737, + "num_tokens": 212509393.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 1.6570091247558594e-05, + "grad_norm": 1.6313022375106812, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8625431060791016, + "num_tokens": 212547205.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.607844591140747, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8582773804664612, + "num_tokens": 212583929.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6785274744033813, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8595683574676514, + "num_tokens": 212620769.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5823688507080078, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8465880751609802, + "num_tokens": 212661591.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8182566165924072, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8507593274116516, + "num_tokens": 212700272.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4931540489196777, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8617279529571533, + "num_tokens": 212744895.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5117729902267456, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8736302256584167, + "num_tokens": 212781639.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6742793321609497, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8472497463226318, + "num_tokens": 212816850.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6026780605316162, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8660143613815308, + "num_tokens": 212852715.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4166820049285889, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8679360151290894, + "num_tokens": 212894231.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5882164239883423, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8715969324111938, + "num_tokens": 212929678.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6085768938064575, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8671057224273682, + "num_tokens": 212969165.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5468686819076538, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8673204779624939, + "num_tokens": 213009523.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4442323446273804, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.868897020816803, + "num_tokens": 213050138.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4898202419281006, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8781804442405701, + "num_tokens": 213090200.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.7209842205047607, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8545745611190796, + "num_tokens": 213127845.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.627873420715332, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8354710340499878, + "num_tokens": 213168420.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5985972881317139, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8697402477264404, + "num_tokens": 213207803.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 2.1443662643432617, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8625312447547913, + "num_tokens": 213250020.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5895638465881348, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8592883348464966, + "num_tokens": 213288090.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6838641166687012, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8679723143577576, + "num_tokens": 213325548.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8381726741790771, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8551446199417114, + "num_tokens": 213355479.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5908751487731934, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.850222110748291, + "num_tokens": 213397224.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6310665607452393, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8621155619621277, + "num_tokens": 213434460.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6323014497756958, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8599734306335449, + "num_tokens": 213471750.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5977236032485962, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8693441152572632, + "num_tokens": 213508870.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4921984672546387, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8584813475608826, + "num_tokens": 213552345.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8896814584732056, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8539228439331055, + "num_tokens": 213584939.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.567250370979309, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.854449987411499, + "num_tokens": 213627538.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6816811561584473, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8569111824035645, + "num_tokens": 213669977.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.606918454170227, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8671422004699707, + "num_tokens": 213708807.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.615926742553711, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8553935289382935, + "num_tokens": 213747475.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6366225481033325, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8598611354827881, + "num_tokens": 213785118.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5641655921936035, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8621813058853149, + "num_tokens": 213827466.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5571599006652832, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8592537641525269, + "num_tokens": 213868975.0, + "step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.375033974647522, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8680586814880371, + "num_tokens": 213912756.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5369908809661865, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8530161380767822, + "num_tokens": 213951862.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4268733263015747, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8712463974952698, + "num_tokens": 213992674.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6131738424301147, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8651742935180664, + "num_tokens": 214027797.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6258660554885864, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8516613245010376, + "num_tokens": 214065305.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.718570351600647, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8494157195091248, + "num_tokens": 214099841.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5578914880752563, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8580561280250549, + "num_tokens": 214141962.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.7026088237762451, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.865984320640564, + "num_tokens": 214176471.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6406763792037964, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8509727120399475, + "num_tokens": 214216349.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.683555245399475, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8622255325317383, + "num_tokens": 214254597.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6234402656555176, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8731145262718201, + "num_tokens": 214290250.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6641854047775269, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8568743467330933, + "num_tokens": 214323683.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5621310472488403, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8628522157669067, + "num_tokens": 214363393.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.50871741771698, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8582116961479187, + "num_tokens": 214406433.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.789536952972412, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8570269346237183, + "num_tokens": 214437110.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5313223600387573, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8690872192382812, + "num_tokens": 214474665.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5969512462615967, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8475148677825928, + "num_tokens": 214517593.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.496842384338379, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8712480068206787, + "num_tokens": 214560342.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6271564960479736, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8640686869621277, + "num_tokens": 214599846.0, + "step": 5628 + }, + { + "epoch": 0.7160666581859815, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.586260199546814, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8516324758529663, + "num_tokens": 214640098.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6103562116622925, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8611469268798828, + "num_tokens": 214679542.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.616461157798767, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8577797412872314, + "num_tokens": 214718974.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6411728858947754, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8647294044494629, + "num_tokens": 214757147.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5899450778961182, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8659921884536743, + "num_tokens": 214796978.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5946263074874878, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.871394157409668, + "num_tokens": 214833489.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4684151411056519, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.856338620185852, + "num_tokens": 214880001.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5227165222167969, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8723230957984924, + "num_tokens": 214920439.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5564998388290405, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8663727045059204, + "num_tokens": 214957722.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6646775007247925, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8516097664833069, + "num_tokens": 214994070.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.555147647857666, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8748918771743774, + "num_tokens": 215029023.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5508434772491455, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8567888140678406, + "num_tokens": 215070210.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6239776611328125, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8636500239372253, + "num_tokens": 215105961.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4691437482833862, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8548077344894409, + "num_tokens": 215149157.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.616328239440918, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8683878183364868, + "num_tokens": 215185993.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5413798093795776, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8620588779449463, + "num_tokens": 215226261.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6537636518478394, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8514107465744019, + "num_tokens": 215260528.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5579594373703003, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8657792806625366, + "num_tokens": 215298107.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.599782943725586, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8566646575927734, + "num_tokens": 215339026.0, + "step": 5647 + }, + { + "epoch": 0.7184836534792011, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8066233396530151, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8400324583053589, + "num_tokens": 215375044.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5435084104537964, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8693657517433167, + "num_tokens": 215413039.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5659401416778564, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.864149808883667, + "num_tokens": 215448927.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.7395013570785522, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8740234375, + "num_tokens": 215479413.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8553849458694458, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8714216947555542, + "num_tokens": 215509739.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.8635494709014893, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.863876461982727, + "num_tokens": 215543438.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5910861492156982, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8602127432823181, + "num_tokens": 215579608.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6496244668960571, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8830208778381348, + "num_tokens": 215612608.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 3.7468907833099365, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8632701635360718, + "num_tokens": 215652497.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6559725999832153, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8626872897148132, + "num_tokens": 215686821.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5916740894317627, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8653426766395569, + "num_tokens": 215728615.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5466078519821167, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8668040037155151, + "num_tokens": 215767911.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4058812856674194, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8823581337928772, + "num_tokens": 215810717.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.602478265762329, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8601689338684082, + "num_tokens": 215851515.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6857367753982544, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8553565740585327, + "num_tokens": 215888425.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6153786182403564, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8549730777740479, + "num_tokens": 215925442.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6018146276474, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.870094358921051, + "num_tokens": 215958139.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4888436794281006, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8648384809494019, + "num_tokens": 215996153.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6996769905090332, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8413596153259277, + "num_tokens": 216035882.0, + "step": 5666 + }, + { + "epoch": 0.7209006487724208, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.495339035987854, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8701180219650269, + "num_tokens": 216075554.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5434727668762207, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8647773265838623, + "num_tokens": 216111300.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5358613729476929, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8698250651359558, + "num_tokens": 216150374.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.7268677949905396, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8571327924728394, + "num_tokens": 216184560.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5170888900756836, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8852826356887817, + "num_tokens": 216220741.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6002188920974731, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8647875189781189, + "num_tokens": 216257685.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.548120141029358, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.865149736404419, + "num_tokens": 216294331.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.555097222328186, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8638461232185364, + "num_tokens": 216334085.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.530554175376892, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8661714792251587, + "num_tokens": 216372356.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5575073957443237, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8578802347183228, + "num_tokens": 216412685.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6061992645263672, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8757352828979492, + "num_tokens": 216448171.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.4474542140960693, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8893319368362427, + "num_tokens": 216488299.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.6434930562973022, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8507512807846069, + "num_tokens": 216527894.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5942895412445068, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8707920908927917, + "num_tokens": 216563903.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5963194370269775, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8625736236572266, + "num_tokens": 216601263.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.657686471939087, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8506098985671997, + "num_tokens": 216640363.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.584477424621582, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8690632581710815, + "num_tokens": 216677543.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 1.6689300537109375e-05, + "grad_norm": 1.5427606105804443, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8465440273284912, + "num_tokens": 216718893.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.586275339126587, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8623934388160706, + "num_tokens": 216755290.0, + "step": 5685 + }, + { + "epoch": 0.7233176440656405, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5948299169540405, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8596721887588501, + "num_tokens": 216792401.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6705759763717651, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8593946695327759, + "num_tokens": 216825198.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5541658401489258, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8589333295822144, + "num_tokens": 216864312.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4724879264831543, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8704107999801636, + "num_tokens": 216907152.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5847628116607666, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8774137496948242, + "num_tokens": 216942406.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5720810890197754, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8650162816047668, + "num_tokens": 216981306.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5626786947250366, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8519090414047241, + "num_tokens": 217021382.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6040663719177246, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8615748286247253, + "num_tokens": 217058342.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.49118971824646, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8571960926055908, + "num_tokens": 217102741.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6198649406433105, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8591939210891724, + "num_tokens": 217138178.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4606173038482666, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8680863976478577, + "num_tokens": 217181428.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5526506900787354, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8666137456893921, + "num_tokens": 217221278.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.504912257194519, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8696275353431702, + "num_tokens": 217261954.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.544596552848816, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8566640019416809, + "num_tokens": 217301555.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6331896781921387, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8481996655464172, + "num_tokens": 217339446.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7466634511947632, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8459319472312927, + "num_tokens": 217374137.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5078414678573608, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.850548505783081, + "num_tokens": 217414576.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.488824486732483, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8681799173355103, + "num_tokens": 217454724.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6287764310836792, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8473365306854248, + "num_tokens": 217490567.0, + "step": 5704 + }, + { + "epoch": 0.7257346393588602, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4349267482757568, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.858052134513855, + "num_tokens": 217535568.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5073267221450806, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8571155667304993, + "num_tokens": 217575425.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6268776655197144, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8696934580802917, + "num_tokens": 217608469.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4830046892166138, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8556795120239258, + "num_tokens": 217652775.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6271663904190063, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8590503931045532, + "num_tokens": 217693482.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.2961344718933105, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8755816221237183, + "num_tokens": 217725720.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6416505575180054, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8623900413513184, + "num_tokens": 217761261.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5924347639083862, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8534093499183655, + "num_tokens": 217802781.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.517206072807312, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8678876161575317, + "num_tokens": 217843681.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.556808352470398, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8630017042160034, + "num_tokens": 217884141.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7621140480041504, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8610576391220093, + "num_tokens": 217917916.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7982221841812134, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.855518102645874, + "num_tokens": 217948539.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.682194709777832, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8553766012191772, + "num_tokens": 217988314.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.658345341682434, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8536075949668884, + "num_tokens": 218027202.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5080116987228394, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8762422800064087, + "num_tokens": 218064808.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5580872297286987, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8520835638046265, + "num_tokens": 218107525.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6046468019485474, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8654802441596985, + "num_tokens": 218146574.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5654492378234863, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.858077883720398, + "num_tokens": 218183970.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6199562549591064, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8699069023132324, + "num_tokens": 218216542.0, + "step": 5723 + }, + { + "epoch": 0.7281516346520799, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4425349235534668, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8745953440666199, + "num_tokens": 218260573.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5688756704330444, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8701391816139221, + "num_tokens": 218299887.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.9519431591033936, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8462280035018921, + "num_tokens": 218335617.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6441051959991455, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8546624183654785, + "num_tokens": 218371762.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.586553931236267, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8547836542129517, + "num_tokens": 218412841.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6757488250732422, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8737795352935791, + "num_tokens": 218450234.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8362497091293335, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8582817316055298, + "num_tokens": 218482198.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.752092719078064, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.860077977180481, + "num_tokens": 218522153.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7355868816375732, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8429608345031738, + "num_tokens": 218561128.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.62934410572052, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8582723736763, + "num_tokens": 218600169.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5859887599945068, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8790410161018372, + "num_tokens": 218640737.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8379756212234497, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8564835786819458, + "num_tokens": 218676999.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4366511106491089, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8640143871307373, + "num_tokens": 218721324.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4426050186157227, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.872618556022644, + "num_tokens": 218763148.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.740336298942566, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8690675497055054, + "num_tokens": 218798222.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7633870840072632, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8685218691825867, + "num_tokens": 218831597.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5536257028579712, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8696633577346802, + "num_tokens": 218869872.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6235638856887817, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8625890016555786, + "num_tokens": 218906717.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5247986316680908, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8567591905593872, + "num_tokens": 218952199.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5718234777450562, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8567858338356018, + "num_tokens": 218999650.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6887308359146118, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8501132726669312, + "num_tokens": 219039398.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5939937829971313, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8645627498626709, + "num_tokens": 219080152.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.3890063762664795, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8644088506698608, + "num_tokens": 219127290.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7174113988876343, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8596484661102295, + "num_tokens": 219159056.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4716286659240723, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8492498397827148, + "num_tokens": 219202280.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5624020099639893, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8664499521255493, + "num_tokens": 219240230.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5838416814804077, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8716384172439575, + "num_tokens": 219279145.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6851366758346558, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8786463737487793, + "num_tokens": 219314705.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4364125728607178, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8694804906845093, + "num_tokens": 219359702.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5847196578979492, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8673304915428162, + "num_tokens": 219396309.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6517307758331299, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8564945459365845, + "num_tokens": 219432775.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6672618389129639, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8601665496826172, + "num_tokens": 219472331.0, + "step": 5755 + }, + { + "epoch": 0.7322223635669762, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.666788935661316, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8619502782821655, + "num_tokens": 219506175.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5149928331375122, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8713511824607849, + "num_tokens": 219546607.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5300023555755615, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8627516627311707, + "num_tokens": 219587159.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.491127848625183, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8637112379074097, + "num_tokens": 219632597.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.735336422920227, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8675699830055237, + "num_tokens": 219663837.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5055733919143677, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8540518283843994, + "num_tokens": 219706252.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 2.197291612625122, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8631402850151062, + "num_tokens": 219747180.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6196422576904297, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8592986464500427, + "num_tokens": 219786283.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5835912227630615, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8694711327552795, + "num_tokens": 219821097.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.630263090133667, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8445143699645996, + "num_tokens": 219861901.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5683515071868896, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8607087135314941, + "num_tokens": 219903929.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6809178590774536, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8535515069961548, + "num_tokens": 219940992.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7094156742095947, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8686090707778931, + "num_tokens": 219973930.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6843966245651245, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8621870279312134, + "num_tokens": 220008156.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5785435438156128, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8635337948799133, + "num_tokens": 220046238.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6110683679580688, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8613852262496948, + "num_tokens": 220088080.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5531070232391357, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8724299669265747, + "num_tokens": 220127697.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8445475101470947, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8658223152160645, + "num_tokens": 220161385.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6123430728912354, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.859419584274292, + "num_tokens": 220200288.0, + "step": 5774 + }, + { + "epoch": 0.7346393588601959, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5799212455749512, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8491289019584656, + "num_tokens": 220244327.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5839084386825562, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8810926079750061, + "num_tokens": 220277477.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8089174032211304, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8604836463928223, + "num_tokens": 220312377.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7492800951004028, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8652964234352112, + "num_tokens": 220345451.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6101990938186646, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8678785562515259, + "num_tokens": 220389378.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6113085746765137, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8515615463256836, + "num_tokens": 220429814.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7844833135604858, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8588906526565552, + "num_tokens": 220461881.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7930647134780884, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8486045598983765, + "num_tokens": 220498948.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.652764916419983, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8467279672622681, + "num_tokens": 220536765.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5704787969589233, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8540047407150269, + "num_tokens": 220580916.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6112746000289917, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8612993359565735, + "num_tokens": 220615148.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5256670713424683, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.858690619468689, + "num_tokens": 220656471.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5564897060394287, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8532861471176147, + "num_tokens": 220698930.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.742531418800354, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8642459511756897, + "num_tokens": 220733780.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4908497333526611, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8695581555366516, + "num_tokens": 220774631.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.480018973350525, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8546322584152222, + "num_tokens": 220817969.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.608306646347046, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8494465351104736, + "num_tokens": 220856750.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6669505834579468, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8548368811607361, + "num_tokens": 220892039.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.563706636428833, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.875198483467102, + "num_tokens": 220928940.0, + "step": 5793 + }, + { + "epoch": 0.7370563541534156, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6324621438980103, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8424057364463806, + "num_tokens": 220970320.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7283709049224854, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8537328243255615, + "num_tokens": 221007207.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.647746205329895, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8700082302093506, + "num_tokens": 221044105.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5356217622756958, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8626478910446167, + "num_tokens": 221084226.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.498538851737976, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8555232286453247, + "num_tokens": 221125501.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4710620641708374, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8638902902603149, + "num_tokens": 221167208.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.9300113916397095, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8494547605514526, + "num_tokens": 221203005.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5077906847000122, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8736685514450073, + "num_tokens": 221242624.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5716071128845215, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8595412969589233, + "num_tokens": 221281889.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6025683879852295, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8595335483551025, + "num_tokens": 221321734.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.503204107284546, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.874230146408081, + "num_tokens": 221363542.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4635220766067505, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8659493327140808, + "num_tokens": 221403654.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5359880924224854, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8806865215301514, + "num_tokens": 221441140.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4565279483795166, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8529305458068848, + "num_tokens": 221485826.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6537123918533325, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8616750836372375, + "num_tokens": 221524556.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5712405443191528, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8712413311004639, + "num_tokens": 221560568.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6874639987945557, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8531985282897949, + "num_tokens": 221599998.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6987446546554565, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8495007753372192, + "num_tokens": 221636430.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.9498862028121948, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8564506769180298, + "num_tokens": 221665263.0, + "step": 5812 + }, + { + "epoch": 0.7394733494466353, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6198301315307617, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8653119802474976, + "num_tokens": 221703266.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5627095699310303, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.873400092124939, + "num_tokens": 221741620.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7261122465133667, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8454915881156921, + "num_tokens": 221780329.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5708048343658447, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8592857122421265, + "num_tokens": 221821456.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.664582371711731, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8515496253967285, + "num_tokens": 221860562.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5571345090866089, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8637640476226807, + "num_tokens": 221896631.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.624721884727478, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8599782586097717, + "num_tokens": 221939323.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.533889889717102, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8748681545257568, + "num_tokens": 221975991.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6928104162216187, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8814281225204468, + "num_tokens": 222008665.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5392377376556396, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8619008660316467, + "num_tokens": 222045872.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.461868166923523, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8681134581565857, + "num_tokens": 222084764.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6106680631637573, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8722164630889893, + "num_tokens": 222121331.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4566404819488525, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8509858846664429, + "num_tokens": 222167198.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6338521242141724, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8568333387374878, + "num_tokens": 222201761.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5486366748809814, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8612812757492065, + "num_tokens": 222243579.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6510576009750366, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8582513332366943, + "num_tokens": 222283868.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5052456855773926, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8865826725959778, + "num_tokens": 222318300.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.521384358406067, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.866818904876709, + "num_tokens": 222355656.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.773262619972229, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8580523729324341, + "num_tokens": 222389514.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.527439832687378, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8563594818115234, + "num_tokens": 222430738.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4835673570632935, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8733463287353516, + "num_tokens": 222472224.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6306592226028442, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8549742102622986, + "num_tokens": 222509674.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5787537097930908, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8718321919441223, + "num_tokens": 222542521.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.587856650352478, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8625895380973816, + "num_tokens": 222578186.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.51694655418396, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8763588070869446, + "num_tokens": 222623236.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7334048748016357, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.848009467124939, + "num_tokens": 222659295.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6326411962509155, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8667744398117065, + "num_tokens": 222695786.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6454614400863647, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.866350531578064, + "num_tokens": 222731897.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.608681559562683, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8512052297592163, + "num_tokens": 222774292.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5164973735809326, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8739840984344482, + "num_tokens": 222815648.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.607802391052246, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8817463517189026, + "num_tokens": 222851118.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6981076002120972, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8574813604354858, + "num_tokens": 222887902.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6569899320602417, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8716273903846741, + "num_tokens": 222924181.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7453579902648926, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8510915040969849, + "num_tokens": 222958960.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7691867351531982, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.871606707572937, + "num_tokens": 222990864.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6334856748580933, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8770502209663391, + "num_tokens": 223026875.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5998954772949219, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8496930599212646, + "num_tokens": 223067178.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5385233163833618, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8673232793807983, + "num_tokens": 223110588.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6945858001708984, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8657613396644592, + "num_tokens": 223145063.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5998486280441284, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8557739853858948, + "num_tokens": 223182624.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5602434873580933, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8773295283317566, + "num_tokens": 223219893.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.783698558807373, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8648402094841003, + "num_tokens": 223254541.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5181543827056885, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.876785159111023, + "num_tokens": 223296594.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8259344100952148, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8669674396514893, + "num_tokens": 223326773.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5228763818740845, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8619771003723145, + "num_tokens": 223368112.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5518722534179688, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8753622174263, + "num_tokens": 223407039.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5663608312606812, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8544899821281433, + "num_tokens": 223448116.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5925617218017578, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8671232461929321, + "num_tokens": 223489952.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.482311487197876, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8857548832893372, + "num_tokens": 223530788.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4884982109069824, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8707424998283386, + "num_tokens": 223571698.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8416380882263184, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8589323163032532, + "num_tokens": 223603785.0, + "step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5563170909881592, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8584814667701721, + "num_tokens": 223643819.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4263046979904175, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8765966892242432, + "num_tokens": 223690412.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6492547988891602, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8660132884979248, + "num_tokens": 223728101.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6961734294891357, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8595237731933594, + "num_tokens": 223764863.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7160688638687134, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8764187097549438, + "num_tokens": 223802162.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6102838516235352, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8677386045455933, + "num_tokens": 223839519.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.660203456878662, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8644687533378601, + "num_tokens": 223877278.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5215256214141846, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8590197563171387, + "num_tokens": 223918136.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.4599781036376953, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8735594153404236, + "num_tokens": 223959603.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6549324989318848, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.874470591545105, + "num_tokens": 223997337.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.8728151321411133, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8710921406745911, + "num_tokens": 224030987.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.61231529712677, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8614157438278198, + "num_tokens": 224069863.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.6064311265945435, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8602690100669861, + "num_tokens": 224110943.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5800718069076538, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8679159879684448, + "num_tokens": 224146635.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7256176471710205, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8664101362228394, + "num_tokens": 224178730.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.550477385520935, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8527688980102539, + "num_tokens": 224220715.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5611852407455444, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8581559658050537, + "num_tokens": 224262997.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5861773490905762, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8636661767959595, + "num_tokens": 224300954.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.673394799232483, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8729037046432495, + "num_tokens": 224330507.0, + "step": 5882 + }, + { + "epoch": 0.748378068947971, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.552955985069275, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8696966171264648, + "num_tokens": 224371343.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6114264726638794, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8748984336853027, + "num_tokens": 224404804.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5509204864501953, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8765333890914917, + "num_tokens": 224438596.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.722660779953003, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8475877046585083, + "num_tokens": 224473136.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.5339664220809937, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8690700531005859, + "num_tokens": 224512097.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5170416831970215, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8603910803794861, + "num_tokens": 224551698.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.7381998300552368, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8685231804847717, + "num_tokens": 224585038.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5630015134811401, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8550511002540588, + "num_tokens": 224625185.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7889553308486938, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.867007315158844, + "num_tokens": 224655457.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "ewc_loss": 1.6808509826660156e-05, + "grad_norm": 1.563427209854126, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8643155097961426, + "num_tokens": 224695184.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.729278326034546, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8662653565406799, + "num_tokens": 224726436.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5487115383148193, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8736371994018555, + "num_tokens": 224761354.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.769105315208435, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8582803606987, + "num_tokens": 224796080.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5799484252929688, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8647298216819763, + "num_tokens": 224837956.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7381973266601562, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8509114980697632, + "num_tokens": 224871087.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5627670288085938, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8534190058708191, + "num_tokens": 224911418.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6234461069107056, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8666443824768066, + "num_tokens": 224950424.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6647496223449707, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8618032932281494, + "num_tokens": 224983938.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6898343563079834, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8668876886367798, + "num_tokens": 225019059.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6656863689422607, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8473193645477295, + "num_tokens": 225057607.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5366997718811035, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8629641532897949, + "num_tokens": 225096638.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6603033542633057, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8621920347213745, + "num_tokens": 225130463.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7047048807144165, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8664702773094177, + "num_tokens": 225162892.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6997089385986328, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8501484990119934, + "num_tokens": 225197170.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6019045114517212, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8664203882217407, + "num_tokens": 225233873.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.57534658908844, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8690329790115356, + "num_tokens": 225272888.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.569401741027832, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8618059158325195, + "num_tokens": 225312087.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.496668815612793, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8704627156257629, + "num_tokens": 225354332.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5435720682144165, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8793861269950867, + "num_tokens": 225388613.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.778171420097351, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8427090644836426, + "num_tokens": 225423858.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5992869138717651, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8733084201812744, + "num_tokens": 225462279.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.8089683055877686, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8703035712242126, + "num_tokens": 225497423.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6191296577453613, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8575726747512817, + "num_tokens": 225537697.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 2.2820000648498535, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8708547353744507, + "num_tokens": 225571071.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5714865922927856, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8626776337623596, + "num_tokens": 225610735.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5250966548919678, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8799475431442261, + "num_tokens": 225647306.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5969116687774658, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8566994667053223, + "num_tokens": 225686461.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4721496105194092, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8692278861999512, + "num_tokens": 225727292.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4917633533477783, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8706761598587036, + "num_tokens": 225768761.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7236263751983643, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8603520393371582, + "num_tokens": 225803275.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5653482675552368, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8616589903831482, + "num_tokens": 225840660.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7259577512741089, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.856583833694458, + "num_tokens": 225875803.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6197675466537476, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8693621158599854, + "num_tokens": 225909795.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7203952074050903, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8528521060943604, + "num_tokens": 225943659.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.543094277381897, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8733783960342407, + "num_tokens": 225985003.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5557746887207031, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8652986288070679, + "num_tokens": 226022169.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5045280456542969, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.885931134223938, + "num_tokens": 226057448.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.361433506011963, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.887954592704773, + "num_tokens": 226101784.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4652554988861084, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8789385557174683, + "num_tokens": 226141654.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5955392122268677, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.874869167804718, + "num_tokens": 226175071.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.488923192024231, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8669875860214233, + "num_tokens": 226215871.0, + "step": 5933 + }, + { + "epoch": 0.754865793156087, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 4.876928329467773, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8561205863952637, + "num_tokens": 226250955.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.673512578010559, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8563488721847534, + "num_tokens": 226282793.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4029604196548462, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8688080906867981, + "num_tokens": 226325926.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5183544158935547, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8718997240066528, + "num_tokens": 226364094.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4339138269424438, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8711874485015869, + "num_tokens": 226406239.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5149922370910645, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8658672571182251, + "num_tokens": 226444141.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7104554176330566, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8655616044998169, + "num_tokens": 226476668.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.9332752227783203, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8594637513160706, + "num_tokens": 226508024.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6052591800689697, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8446856737136841, + "num_tokens": 226552221.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.697351098060608, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8610919713973999, + "num_tokens": 226588838.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5988168716430664, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8734617829322815, + "num_tokens": 226628650.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.783913493156433, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.858478307723999, + "num_tokens": 226662363.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6187978982925415, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8485749959945679, + "num_tokens": 226702742.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.6505845785140991, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8648850917816162, + "num_tokens": 226740041.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.4409596920013428, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8615186214447021, + "num_tokens": 226785978.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.578331708908081, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.857956051826477, + "num_tokens": 226823693.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5815889835357666, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8773864507675171, + "num_tokens": 226858716.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.48188316822052, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8801237344741821, + "num_tokens": 226899398.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5473800897598267, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8832217454910278, + "num_tokens": 226933960.0, + "step": 5952 + }, + { + "epoch": 0.7572827884493067, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.6470268964767456, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8685317039489746, + "num_tokens": 226968529.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.4772875308990479, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8710682988166809, + "num_tokens": 227010050.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4695703983306885, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8763359785079956, + "num_tokens": 227053300.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.5952584743499756, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8693199157714844, + "num_tokens": 227088741.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.4596529006958008, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8679771423339844, + "num_tokens": 227131534.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7865043878555298, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8761463165283203, + "num_tokens": 227165361.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5827690362930298, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8700644969940186, + "num_tokens": 227203304.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 1.6927719116210938e-05, + "grad_norm": 1.7033650875091553, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8622514009475708, + "num_tokens": 227240410.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.6352386474609375, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8568381667137146, + "num_tokens": 227278537.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.5778656005859375, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8477339744567871, + "num_tokens": 227321746.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.7878940105438232, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8680649399757385, + "num_tokens": 227356487.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.635480284690857, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8687357902526855, + "num_tokens": 227391682.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.584326982498169, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8798147439956665, + "num_tokens": 227424791.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.7156542539596558, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8722003698348999, + "num_tokens": 227454257.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.7423181533813477, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8620444536209106, + "num_tokens": 227493310.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.702825903892517, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8657100200653076, + "num_tokens": 227531587.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.5575692653656006, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8685232400894165, + "num_tokens": 227572321.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.749221920967102, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8576449155807495, + "num_tokens": 227607013.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.5525774955749512, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8531525135040283, + "num_tokens": 227649779.0, + "step": 5971 + }, + { + "epoch": 0.7596997837425264, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.6509324312210083, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8491814136505127, + "num_tokens": 227690191.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.729612112045288, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8623820543289185, + "num_tokens": 227725491.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.451634407043457, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8831843137741089, + "num_tokens": 227763950.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.5935852527618408, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8700819611549377, + "num_tokens": 227799200.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.6432433128356934, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8523275256156921, + "num_tokens": 227839100.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.7559820413589478, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8526759147644043, + "num_tokens": 227869927.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5399092435836792, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8699815273284912, + "num_tokens": 227907598.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.7289611101150513, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8621997833251953, + "num_tokens": 227940142.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5310883522033691, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8577954173088074, + "num_tokens": 227980405.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.525680422782898, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8746568560600281, + "num_tokens": 228023196.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5679692029953003, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8531638383865356, + "num_tokens": 228061532.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5901929140090942, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8679563403129578, + "num_tokens": 228100474.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.586869716644287, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8582115173339844, + "num_tokens": 228138456.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6972893476486206, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.850242018699646, + "num_tokens": 228174495.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6377017498016357, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8667309284210205, + "num_tokens": 228212765.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.589695692062378, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8581401705741882, + "num_tokens": 228252039.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5758570432662964, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8566809296607971, + "num_tokens": 228291711.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5718990564346313, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.861798107624054, + "num_tokens": 228330074.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6233627796173096, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8799285292625427, + "num_tokens": 228364814.0, + "step": 5990 + }, + { + "epoch": 0.762116779035746, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6400660276412964, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.86945641040802, + "num_tokens": 228400012.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.4777904748916626, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8813372254371643, + "num_tokens": 228436684.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6154475212097168, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8823085427284241, + "num_tokens": 228468905.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.691177487373352, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8641737699508667, + "num_tokens": 228502689.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.4904853105545044, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8700743913650513, + "num_tokens": 228545853.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5660992860794067, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8737322688102722, + "num_tokens": 228582029.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5355091094970703, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.854785680770874, + "num_tokens": 228624082.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5699236392974854, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8675135374069214, + "num_tokens": 228659351.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.514965295791626, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8694935441017151, + "num_tokens": 228699981.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.509944200515747, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8689279556274414, + "num_tokens": 228743354.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6414891481399536, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8516294360160828, + "num_tokens": 228779087.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6331855058670044, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8801785707473755, + "num_tokens": 228817140.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.636598825454712, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8464192152023315, + "num_tokens": 228858600.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6083532571792603, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8769022226333618, + "num_tokens": 228893487.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.493203043937683, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8674157857894897, + "num_tokens": 228935592.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.7183244228363037, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8666882514953613, + "num_tokens": 228971770.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5382874011993408, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8612270355224609, + "num_tokens": 229016830.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6452783346176147, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8594943284988403, + "num_tokens": 229055275.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5517833232879639, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8821530938148499, + "num_tokens": 229089929.0, + "step": 6009 + }, + { + "epoch": 0.7645337743289657, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.4113223552703857, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8723682761192322, + "num_tokens": 229132994.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5336580276489258, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8692626953125, + "num_tokens": 229174447.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6410892009735107, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8677069544792175, + "num_tokens": 229208662.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.583960771560669, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8769562840461731, + "num_tokens": 229245088.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 1.704692840576172e-05, + "grad_norm": 1.639979362487793, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8622866868972778, + "num_tokens": 229282847.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6288739442825317, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8633782863616943, + "num_tokens": 229320181.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6143931150436401, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8558782339096069, + "num_tokens": 229358605.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.7862842082977295, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8664470911026001, + "num_tokens": 229391098.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.462630033493042, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8688178658485413, + "num_tokens": 229436244.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6661789417266846, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8594900369644165, + "num_tokens": 229470059.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6451760530471802, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8677953481674194, + "num_tokens": 229505591.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6639782190322876, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.872298538684845, + "num_tokens": 229539174.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6394366025924683, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8552956581115723, + "num_tokens": 229573374.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.488142967224121, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.858027994632721, + "num_tokens": 229615553.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5624173879623413, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8633787631988525, + "num_tokens": 229654542.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5863533020019531, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.864772617816925, + "num_tokens": 229695292.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5277153253555298, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.865084171295166, + "num_tokens": 229738077.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5532667636871338, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8730278015136719, + "num_tokens": 229779000.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6323927640914917, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8606656789779663, + "num_tokens": 229817864.0, + "step": 6028 + }, + { + "epoch": 0.7669507696221854, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5498794317245483, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8683929443359375, + "num_tokens": 229855824.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6517846584320068, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8647621870040894, + "num_tokens": 229889872.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6880981922149658, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.842064380645752, + "num_tokens": 229927716.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.6928212642669678, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8678468465805054, + "num_tokens": 229960410.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6967371702194214, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.861525297164917, + "num_tokens": 229995271.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.59306800365448, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8524167537689209, + "num_tokens": 230038051.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.539954423904419, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8643008470535278, + "num_tokens": 230080909.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 2.2446353435516357, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8458097577095032, + "num_tokens": 230117168.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6176915168762207, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.839034914970398, + "num_tokens": 230156291.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5981203317642212, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.849705159664154, + "num_tokens": 230198617.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7164570093154907, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8607414364814758, + "num_tokens": 230231312.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.631830096244812, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8686144948005676, + "num_tokens": 230265605.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.565850853919983, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8565278053283691, + "num_tokens": 230304977.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5499935150146484, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8603171110153198, + "num_tokens": 230344870.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5283445119857788, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8785530924797058, + "num_tokens": 230383803.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5251665115356445, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8687292337417603, + "num_tokens": 230423522.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4702492952346802, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8523993492126465, + "num_tokens": 230471437.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7769140005111694, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.852160632610321, + "num_tokens": 230503300.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6822857856750488, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8540825843811035, + "num_tokens": 230544070.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4504311084747314, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8696643114089966, + "num_tokens": 230586573.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.571057915687561, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8506821393966675, + "num_tokens": 230629136.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4909248352050781, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8638468980789185, + "num_tokens": 230667556.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4997562170028687, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8639658093452454, + "num_tokens": 230706307.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5209909677505493, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8591145873069763, + "num_tokens": 230746023.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.728416085243225, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8597663640975952, + "num_tokens": 230781129.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.644142508506775, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8613297343254089, + "num_tokens": 230819450.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5702310800552368, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8732044696807861, + "num_tokens": 230858645.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4577438831329346, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8764525651931763, + "num_tokens": 230898414.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5475083589553833, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8799932599067688, + "num_tokens": 230934811.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7330663204193115, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8572925925254822, + "num_tokens": 230974158.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4377567768096924, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8621208071708679, + "num_tokens": 231021018.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.671382188796997, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.856236457824707, + "num_tokens": 231059406.0, + "step": 6060 + }, + { + "epoch": 0.7710214985370818, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4890652894973755, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8740863800048828, + "num_tokens": 231100409.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4912397861480713, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8757395148277283, + "num_tokens": 231136370.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5817692279815674, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8724595308303833, + "num_tokens": 231170610.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7499768733978271, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8383784294128418, + "num_tokens": 231211000.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.581701397895813, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8717638254165649, + "num_tokens": 231250144.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6499356031417847, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8562557697296143, + "num_tokens": 231290412.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5767754316329956, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8626895546913147, + "num_tokens": 231328057.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5178563594818115, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8757309913635254, + "num_tokens": 231366039.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6690160036087036, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8527697324752808, + "num_tokens": 231404092.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5854724645614624, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8547950387001038, + "num_tokens": 231444120.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5544178485870361, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8642734289169312, + "num_tokens": 231481139.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5446906089782715, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8838990926742554, + "num_tokens": 231520179.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5949598550796509, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8547110557556152, + "num_tokens": 231559070.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.544387936592102, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8617334365844727, + "num_tokens": 231598451.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7778193950653076, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8610435724258423, + "num_tokens": 231637167.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "ewc_loss": 1.71661376953125e-05, + "grad_norm": 1.5167642831802368, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8721587657928467, + "num_tokens": 231673341.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5933526754379272, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8687655925750732, + "num_tokens": 231711391.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.422381043434143, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8729832172393799, + "num_tokens": 231756467.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6036218404769897, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8580356240272522, + "num_tokens": 231801963.0, + "step": 6079 + }, + { + "epoch": 0.7734384938303015, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.48329758644104, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8780802488327026, + "num_tokens": 231841708.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.545174241065979, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.866870105266571, + "num_tokens": 231878142.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.498099446296692, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8596000075340271, + "num_tokens": 231920318.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.589677095413208, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.882581353187561, + "num_tokens": 231959218.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.606522798538208, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8737236261367798, + "num_tokens": 231994676.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6189173460006714, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.861944317817688, + "num_tokens": 232031896.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7040503025054932, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8657508492469788, + "num_tokens": 232072054.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6413531303405762, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8521670699119568, + "num_tokens": 232110665.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5343912839889526, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8680118322372437, + "num_tokens": 232150677.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7691651582717896, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8586761951446533, + "num_tokens": 232184651.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.685062289237976, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.850951075553894, + "num_tokens": 232221580.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5347731113433838, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8579297661781311, + "num_tokens": 232265199.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5994230508804321, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.869763195514679, + "num_tokens": 232304923.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5824300050735474, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8592495918273926, + "num_tokens": 232345933.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7703007459640503, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8512112498283386, + "num_tokens": 232379044.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.8321638107299805, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8616305589675903, + "num_tokens": 232409678.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.56790292263031, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.867633581161499, + "num_tokens": 232447518.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5474194288253784, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8677526712417603, + "num_tokens": 232486802.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.70132577419281, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8611853122711182, + "num_tokens": 232523187.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5363502502441406, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8761086463928223, + "num_tokens": 232561597.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5872114896774292, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8696308732032776, + "num_tokens": 232599497.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.9128872156143188, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8690910935401917, + "num_tokens": 232624796.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.377977728843689, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.88255375623703, + "num_tokens": 232669230.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.725415825843811, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8493995666503906, + "num_tokens": 232712914.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5906347036361694, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8746059536933899, + "num_tokens": 232750049.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.480063557624817, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8769741058349609, + "num_tokens": 232792051.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5686960220336914, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8610708117485046, + "num_tokens": 232828765.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5056339502334595, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.875617504119873, + "num_tokens": 232864550.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5750133991241455, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8707952499389648, + "num_tokens": 232904045.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7012170553207397, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8607296943664551, + "num_tokens": 232943883.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6538829803466797, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8587835431098938, + "num_tokens": 232979671.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.509822964668274, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8594765663146973, + "num_tokens": 233019826.0, + "step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6762380599975586, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8716011643409729, + "num_tokens": 233053390.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6516978740692139, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8464736342430115, + "num_tokens": 233091382.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.706511378288269, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8613870739936829, + "num_tokens": 233127576.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.605027675628662, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8540540337562561, + "num_tokens": 233164936.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6259502172470093, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.862230658531189, + "num_tokens": 233203896.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4764608144760132, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8774489164352417, + "num_tokens": 233243314.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5273594856262207, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8643136024475098, + "num_tokens": 233286065.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5977389812469482, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8615677356719971, + "num_tokens": 233325297.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6177822351455688, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.870029628276825, + "num_tokens": 233363674.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5722503662109375, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8722646832466125, + "num_tokens": 233401042.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4866453409194946, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8665494918823242, + "num_tokens": 233441486.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4244437217712402, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8584656119346619, + "num_tokens": 233485718.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7179489135742188, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8608640432357788, + "num_tokens": 233523216.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5803022384643555, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8514008522033691, + "num_tokens": 233564874.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6829572916030884, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8719835877418518, + "num_tokens": 233594127.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6324037313461304, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8519858121871948, + "num_tokens": 233632540.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5846768617630005, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8547185063362122, + "num_tokens": 233677474.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.659544825553894, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.864444375038147, + "num_tokens": 233715213.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5707039833068848, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8687260150909424, + "num_tokens": 233754471.0, + "step": 6130 + }, + { + "epoch": 0.7799262180384176, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6887648105621338, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8678407669067383, + "num_tokens": 233791761.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.4442778825759888, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.864641010761261, + "num_tokens": 233834617.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.653303623199463, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8652859330177307, + "num_tokens": 233873434.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.711921215057373, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8523996472358704, + "num_tokens": 233909836.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.5550810098648071, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8707276582717896, + "num_tokens": 233950040.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.7770384550094604, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8754709959030151, + "num_tokens": 233981588.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 1.728534698486328e-05, + "grad_norm": 1.6282323598861694, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8759241104125977, + "num_tokens": 234016591.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.6043320894241333, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8682082295417786, + "num_tokens": 234051154.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.7509784698486328, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8742960691452026, + "num_tokens": 234086022.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.6553184986114502, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8516080379486084, + "num_tokens": 234125627.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.5541741847991943, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8751347661018372, + "num_tokens": 234170949.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.6454944610595703, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.864393413066864, + "num_tokens": 234206564.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.476812481880188, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8703398108482361, + "num_tokens": 234249024.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.5012731552124023, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8635427951812744, + "num_tokens": 234289722.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.670075535774231, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8641210198402405, + "num_tokens": 234321931.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.5618467330932617, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8681992292404175, + "num_tokens": 234361278.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.7406591176986694, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8467302322387695, + "num_tokens": 234394858.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.6865544319152832, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8554007411003113, + "num_tokens": 234431607.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.601269245147705, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8620672821998596, + "num_tokens": 234469830.0, + "step": 6149 + }, + { + "epoch": 0.7823432133316371, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.7323180437088013, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8378381729125977, + "num_tokens": 234504213.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.8444652557373047, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8584439158439636, + "num_tokens": 234537956.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.6541591882705688, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8559358716011047, + "num_tokens": 234581179.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.685099482536316, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.858741819858551, + "num_tokens": 234616673.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.4855358600616455, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8648025989532471, + "num_tokens": 234663602.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.5185632705688477, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8666787147521973, + "num_tokens": 234705400.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 1.7404556274414062e-05, + "grad_norm": 1.4951744079589844, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8810192346572876, + "num_tokens": 234742298.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.718388319015503, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8544319868087769, + "num_tokens": 234779398.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6920907497406006, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8665047883987427, + "num_tokens": 234815318.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5610182285308838, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.848857581615448, + "num_tokens": 234855479.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5149191617965698, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8745455741882324, + "num_tokens": 234894045.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.8046212196350098, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8521283864974976, + "num_tokens": 234929968.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.760006070137024, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8629831075668335, + "num_tokens": 234965081.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5460400581359863, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8621505498886108, + "num_tokens": 235002791.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6682288646697998, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8535361886024475, + "num_tokens": 235037963.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.571010708808899, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8491818904876709, + "num_tokens": 235080084.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6003493070602417, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8713447451591492, + "num_tokens": 235119583.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6460261344909668, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8609486222267151, + "num_tokens": 235155771.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7615649700164795, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8581720590591431, + "num_tokens": 235188124.0, + "step": 6168 + }, + { + "epoch": 0.7847602086248568, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5080585479736328, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8545107245445251, + "num_tokens": 235234070.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.8758550882339478, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.855475902557373, + "num_tokens": 235270653.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5721148252487183, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8651115298271179, + "num_tokens": 235309267.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5905652046203613, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8685051202774048, + "num_tokens": 235349406.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7051546573638916, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8623900413513184, + "num_tokens": 235382979.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5252273082733154, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8788840770721436, + "num_tokens": 235417539.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.641127586364746, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8566147089004517, + "num_tokens": 235452516.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5721988677978516, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.871146023273468, + "num_tokens": 235490306.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5397933721542358, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.859753429889679, + "num_tokens": 235530242.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7308776378631592, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8596022725105286, + "num_tokens": 235564606.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6236716508865356, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8673014640808105, + "num_tokens": 235601912.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6610592603683472, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.861655056476593, + "num_tokens": 235639734.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.575658917427063, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8682447671890259, + "num_tokens": 235674114.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.8006157875061035, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8567352890968323, + "num_tokens": 235708413.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.638677954673767, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8685340881347656, + "num_tokens": 235740733.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5249837636947632, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8725418448448181, + "num_tokens": 235781193.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7715269327163696, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.857725977897644, + "num_tokens": 235818925.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6290204524993896, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8445963263511658, + "num_tokens": 235858621.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6277333498001099, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8614622354507446, + "num_tokens": 235896240.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.4367575645446777, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8788928389549255, + "num_tokens": 235937867.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6607041358947754, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8751173615455627, + "num_tokens": 235972733.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5760698318481445, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8759610652923584, + "num_tokens": 236010580.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5628844499588013, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8654521107673645, + "num_tokens": 236050842.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6677953004837036, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8796896934509277, + "num_tokens": 236086441.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5260093212127686, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8697672486305237, + "num_tokens": 236125205.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.604705572128296, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8692898154258728, + "num_tokens": 236161788.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6022226810455322, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8708473443984985, + "num_tokens": 236203268.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5302470922470093, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8742597699165344, + "num_tokens": 236241386.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7236354351043701, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8650907874107361, + "num_tokens": 236271231.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.653839349746704, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8771084547042847, + "num_tokens": 236306569.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.58504319190979, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.865933895111084, + "num_tokens": 236345441.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5480093955993652, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8775180578231812, + "num_tokens": 236384452.0, + "step": 6200 + }, + { + "epoch": 0.7888309375397532, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.4701441526412964, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8819891214370728, + "num_tokens": 236426720.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.645932912826538, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8593399524688721, + "num_tokens": 236464411.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.4482336044311523, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8738174438476562, + "num_tokens": 236507966.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.8618024587631226, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8618738055229187, + "num_tokens": 236540305.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6218725442886353, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8524965643882751, + "num_tokens": 236582281.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6274619102478027, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8552144765853882, + "num_tokens": 236620066.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7730510234832764, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.858100175857544, + "num_tokens": 236651836.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5966366529464722, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8552737236022949, + "num_tokens": 236688331.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5743099451065063, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.877139151096344, + "num_tokens": 236725609.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5765380859375, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8579297065734863, + "num_tokens": 236765050.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5997369289398193, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.860703706741333, + "num_tokens": 236806280.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6165457963943481, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.870442807674408, + "num_tokens": 236844569.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.675218939781189, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8607984781265259, + "num_tokens": 236880192.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7023897171020508, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8587862253189087, + "num_tokens": 236913614.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.5956124067306519, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8708680868148804, + "num_tokens": 236957311.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.7497156858444214, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8756049871444702, + "num_tokens": 236992903.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6112672090530396, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8700319528579712, + "num_tokens": 237031813.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.735288381576538, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8556920886039734, + "num_tokens": 237068124.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6320565938949585, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8865389823913574, + "num_tokens": 237100402.0, + "step": 6219 + }, + { + "epoch": 0.7912479328329729, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.6417527198791504, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8660272359848022, + "num_tokens": 237137895.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 1.7523765563964844e-05, + "grad_norm": 1.540426254272461, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8688207268714905, + "num_tokens": 237177888.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6627064943313599, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.857324481010437, + "num_tokens": 237213993.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.839542269706726, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.841107964515686, + "num_tokens": 237249053.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7105762958526611, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8586838245391846, + "num_tokens": 237283322.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6144664287567139, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8737497925758362, + "num_tokens": 237319274.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7522519826889038, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8526566624641418, + "num_tokens": 237352411.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.5269029140472412, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8604317903518677, + "num_tokens": 237396079.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6884621381759644, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.872319221496582, + "num_tokens": 237434519.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7395503520965576, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8687335252761841, + "num_tokens": 237468321.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6957000494003296, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.876265287399292, + "num_tokens": 237500386.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.476545810699463, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8677034378051758, + "num_tokens": 237544596.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.672806739807129, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.867157518863678, + "num_tokens": 237578828.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.704197883605957, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8591427803039551, + "num_tokens": 237616534.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7132295370101929, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8734225630760193, + "num_tokens": 237651136.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.9190226793289185, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8694087266921997, + "num_tokens": 237680542.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.526183009147644, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8626527786254883, + "num_tokens": 237719816.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6476267576217651, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8715803623199463, + "num_tokens": 237752241.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5498303174972534, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8642026782035828, + "num_tokens": 237797089.0, + "step": 6238 + }, + { + "epoch": 0.7936649281261926, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4702785015106201, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8600208759307861, + "num_tokens": 237845068.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.611598014831543, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8606617450714111, + "num_tokens": 237883769.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7641654014587402, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8435728549957275, + "num_tokens": 237921691.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.5098823308944702, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.859299898147583, + "num_tokens": 237961523.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6717621088027954, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8454477190971375, + "num_tokens": 237999220.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.7522064447402954, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8597860336303711, + "num_tokens": 238032650.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6414451599121094, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.869021475315094, + "num_tokens": 238067919.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6098912954330444, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8680305480957031, + "num_tokens": 238102904.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.5820214748382568, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8562451004981995, + "num_tokens": 238142566.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 1.7642974853515625e-05, + "grad_norm": 1.6592364311218262, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8700876235961914, + "num_tokens": 238174959.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5149741172790527, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8773598074913025, + "num_tokens": 238216952.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7136776447296143, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8498730659484863, + "num_tokens": 238256406.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.546351671218872, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8628546595573425, + "num_tokens": 238296869.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.604642391204834, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8630675077438354, + "num_tokens": 238335777.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6636180877685547, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8664356470108032, + "num_tokens": 238373561.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5260196924209595, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8721357583999634, + "num_tokens": 238412271.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5942412614822388, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8609565496444702, + "num_tokens": 238451640.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4364458322525024, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8685636520385742, + "num_tokens": 238496469.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.717193365097046, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8453552722930908, + "num_tokens": 238531353.0, + "step": 6257 + }, + { + "epoch": 0.7960819234194123, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5116546154022217, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8585184216499329, + "num_tokens": 238572158.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5632667541503906, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8665151596069336, + "num_tokens": 238610294.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5717486143112183, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8745507001876831, + "num_tokens": 238652188.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.2043328285217285, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8666867017745972, + "num_tokens": 238690542.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.638580322265625, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8722127676010132, + "num_tokens": 238725344.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.480180025100708, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8400652408599854, + "num_tokens": 238773251.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4526952505111694, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8732198476791382, + "num_tokens": 238813930.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.569196105003357, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8629376888275146, + "num_tokens": 238851433.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.604686975479126, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.871898889541626, + "num_tokens": 238888437.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4388084411621094, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8740971684455872, + "num_tokens": 238929002.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5458762645721436, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8638288378715515, + "num_tokens": 238971953.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.571243405342102, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8673838973045349, + "num_tokens": 239010661.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5780621767044067, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8617448806762695, + "num_tokens": 239050225.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.686892032623291, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8569968938827515, + "num_tokens": 239082912.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.616431474685669, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8751660585403442, + "num_tokens": 239115972.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5638896226882935, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.851806640625, + "num_tokens": 239153707.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.51705002784729, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8615059852600098, + "num_tokens": 239194427.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.46910560131073, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8746469616889954, + "num_tokens": 239234829.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4848127365112305, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8825235366821289, + "num_tokens": 239273603.0, + "step": 6276 + }, + { + "epoch": 0.798498918712632, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4847400188446045, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8638135194778442, + "num_tokens": 239315222.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6839916706085205, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8462596535682678, + "num_tokens": 239352354.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6003628969192505, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8600741624832153, + "num_tokens": 239388511.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6908529996871948, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8538915514945984, + "num_tokens": 239422124.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6683845520019531, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8574241995811462, + "num_tokens": 239459292.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.8367283344268799, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8484526872634888, + "num_tokens": 239492660.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5390011072158813, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8769057989120483, + "num_tokens": 239531377.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.432181477546692, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8790290355682373, + "num_tokens": 239573311.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5075349807739258, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.867741048336029, + "num_tokens": 239613293.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5611425638198853, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8745700120925903, + "num_tokens": 239652219.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5996934175491333, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8583231568336487, + "num_tokens": 239688538.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.586310625076294, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8522146344184875, + "num_tokens": 239727895.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5138720273971558, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8518432378768921, + "num_tokens": 239770427.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6414133310317993, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8631792068481445, + "num_tokens": 239809150.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4518011808395386, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8665924668312073, + "num_tokens": 239850263.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5516223907470703, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8690458536148071, + "num_tokens": 239890040.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.460433006286621, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8719356060028076, + "num_tokens": 239933379.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5518666505813599, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8639187812805176, + "num_tokens": 239971889.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5484977960586548, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8642878532409668, + "num_tokens": 240008866.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4278584718704224, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8693439960479736, + "num_tokens": 240054747.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5855283737182617, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8663465976715088, + "num_tokens": 240091673.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6767668724060059, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8627831339836121, + "num_tokens": 240128571.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.585021734237671, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8631000518798828, + "num_tokens": 240169489.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7286864519119263, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.873831033706665, + "num_tokens": 240204001.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5756016969680786, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8728468418121338, + "num_tokens": 240242853.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6855401992797852, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8642606139183044, + "num_tokens": 240278901.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4692890644073486, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8766452670097351, + "num_tokens": 240314827.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5953565835952759, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8661009669303894, + "num_tokens": 240351162.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5352011919021606, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8571852445602417, + "num_tokens": 240393417.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.8138878345489502, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8460018634796143, + "num_tokens": 240426785.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4384369850158691, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8681053519248962, + "num_tokens": 240467654.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.665775179862976, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8601254820823669, + "num_tokens": 240504955.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.8024210929870605, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8775361776351929, + "num_tokens": 240533919.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6506261825561523, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.868202269077301, + "num_tokens": 240565835.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5843147039413452, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8648788332939148, + "num_tokens": 240601129.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7126259803771973, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8539311289787292, + "num_tokens": 240636007.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.648300290107727, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8759886026382446, + "num_tokens": 240674059.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5837095975875854, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8727962374687195, + "num_tokens": 240712013.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 16.820287704467773, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8759281635284424, + "num_tokens": 240748194.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.450105905532837, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8561607599258423, + "num_tokens": 240793187.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5778508186340332, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8591794967651367, + "num_tokens": 240835973.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6369683742523193, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8692759871482849, + "num_tokens": 240871772.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5762271881103516, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8601804971694946, + "num_tokens": 240912249.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5326281785964966, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8655710220336914, + "num_tokens": 240953137.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4594374895095825, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8714756965637207, + "num_tokens": 241001226.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6533076763153076, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8683807253837585, + "num_tokens": 241036006.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4178040027618408, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8713169097900391, + "num_tokens": 241077819.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5499756336212158, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8634896278381348, + "num_tokens": 241115507.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6177403926849365, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.871739387512207, + "num_tokens": 241154818.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5674375295639038, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8591746687889099, + "num_tokens": 241193357.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4996364116668701, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8697319030761719, + "num_tokens": 241232268.0, + "step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5702407360076904, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8539733290672302, + "num_tokens": 241270607.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5001575946807861, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8812334537506104, + "num_tokens": 241310997.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.679522156715393, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8788599967956543, + "num_tokens": 241346057.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6893504858016968, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8565583229064941, + "num_tokens": 241383988.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7019340991973877, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8589941263198853, + "num_tokens": 241419732.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.560071587562561, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8649344444274902, + "num_tokens": 241458688.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6713992357254028, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8748589158058167, + "num_tokens": 241492304.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5830459594726562, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8675659894943237, + "num_tokens": 241529659.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5663173198699951, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8620595335960388, + "num_tokens": 241573291.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5843983888626099, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8654137849807739, + "num_tokens": 241614755.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6254181861877441, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8584016561508179, + "num_tokens": 241652301.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5795648097991943, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8518974184989929, + "num_tokens": 241695939.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5521786212921143, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8741315603256226, + "num_tokens": 241730670.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7693283557891846, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8543367385864258, + "num_tokens": 241764528.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4985685348510742, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8751636147499084, + "num_tokens": 241805881.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.604618787765503, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8554397821426392, + "num_tokens": 241846141.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5096701383590698, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8658055067062378, + "num_tokens": 241886970.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5255879163742065, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.870710015296936, + "num_tokens": 241928722.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.757224202156067, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8679689764976501, + "num_tokens": 241967176.0, + "step": 6346 + }, + { + "epoch": 0.8074036382139677, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6251956224441528, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8606749176979065, + "num_tokens": 242004201.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4977922439575195, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8686916828155518, + "num_tokens": 242043571.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5264580249786377, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8589218854904175, + "num_tokens": 242083637.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.511916160583496, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8754826784133911, + "num_tokens": 242122516.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4320151805877686, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.879427433013916, + "num_tokens": 242162990.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6971012353897095, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.857900857925415, + "num_tokens": 242200635.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5593361854553223, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8651713132858276, + "num_tokens": 242238005.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5207479000091553, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8539025783538818, + "num_tokens": 242281314.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.7361725568771362, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8591428995132446, + "num_tokens": 242314848.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6056877374649048, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8524630665779114, + "num_tokens": 242356304.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5832126140594482, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8647778034210205, + "num_tokens": 242395659.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5525356531143188, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8734656572341919, + "num_tokens": 242433103.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6115871667861938, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8541752099990845, + "num_tokens": 242470626.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6253055334091187, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8580574989318848, + "num_tokens": 242507768.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.560198426246643, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8698363304138184, + "num_tokens": 242544777.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5680350065231323, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8530400991439819, + "num_tokens": 242585112.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.48270583152771, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8508956432342529, + "num_tokens": 242630002.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.750640630722046, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8635210394859314, + "num_tokens": 242666006.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.4879896640777588, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8675165176391602, + "num_tokens": 242706094.0, + "step": 6365 + }, + { + "epoch": 0.8098206335071874, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.637689232826233, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8569433689117432, + "num_tokens": 242746412.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6136713027954102, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8540945649147034, + "num_tokens": 242781948.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6625763177871704, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8634746074676514, + "num_tokens": 242819919.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 2.0160491466522217, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8622764945030212, + "num_tokens": 242850782.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6368653774261475, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8770233988761902, + "num_tokens": 242883490.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6307086944580078, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.877656877040863, + "num_tokens": 242916154.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5264451503753662, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8765785694122314, + "num_tokens": 242954872.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6727526187896729, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8594879508018494, + "num_tokens": 242990351.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6117318868637085, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8716723918914795, + "num_tokens": 243028801.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6850359439849854, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.856338381767273, + "num_tokens": 243064349.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.594151258468628, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8608664870262146, + "num_tokens": 243103792.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6796976327896118, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.838172435760498, + "num_tokens": 243144047.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5415862798690796, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8579580783843994, + "num_tokens": 243184251.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5831478834152222, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.862492561340332, + "num_tokens": 243222851.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5860177278518677, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8580106496810913, + "num_tokens": 243260330.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.6218795776367188, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8790656328201294, + "num_tokens": 243292147.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5406736135482788, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8768564462661743, + "num_tokens": 243328800.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7206989526748657, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8484026193618774, + "num_tokens": 243365825.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4667760133743286, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8732215166091919, + "num_tokens": 243405652.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5058236122131348, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8682867288589478, + "num_tokens": 243448855.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5950062274932861, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8655375242233276, + "num_tokens": 243485019.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5749211311340332, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8752702474594116, + "num_tokens": 243521145.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4707281589508057, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8633480072021484, + "num_tokens": 243564001.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6303261518478394, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8781242370605469, + "num_tokens": 243600235.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6175965070724487, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8644397258758545, + "num_tokens": 243635871.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5323355197906494, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8674054145812988, + "num_tokens": 243674217.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6050457954406738, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8535497188568115, + "num_tokens": 243712657.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.461698055267334, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8673084378242493, + "num_tokens": 243754180.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5642908811569214, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8656238317489624, + "num_tokens": 243791295.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7642027139663696, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.858168363571167, + "num_tokens": 243824934.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5629152059555054, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8589146137237549, + "num_tokens": 243863823.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5729390382766724, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8694632053375244, + "num_tokens": 243901754.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7447725534439087, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8652108907699585, + "num_tokens": 243934751.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6096997261047363, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8770806193351746, + "num_tokens": 243970045.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.631220817565918, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8692116737365723, + "num_tokens": 244006612.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5851455926895142, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8604570031166077, + "num_tokens": 244046572.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6339083909988403, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8735637664794922, + "num_tokens": 244081859.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6991808414459229, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8631731271743774, + "num_tokens": 244114209.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.543431043624878, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8586106896400452, + "num_tokens": 244155059.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7639360427856445, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8702678680419922, + "num_tokens": 244193798.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.8388574123382568, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.856618344783783, + "num_tokens": 244229542.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.65339994430542, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8607078790664673, + "num_tokens": 244266927.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6348357200622559, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8681048154830933, + "num_tokens": 244306097.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5814043283462524, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8674355745315552, + "num_tokens": 244345748.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7415422201156616, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8487254977226257, + "num_tokens": 244379956.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.8198792934417725, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8538421392440796, + "num_tokens": 244418709.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.725499153137207, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.863019585609436, + "num_tokens": 244455664.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5638759136199951, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8597197532653809, + "num_tokens": 244493249.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6438692808151245, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8639400601387024, + "num_tokens": 244530653.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4766159057617188, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8760610818862915, + "num_tokens": 244575190.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5593937635421753, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8649336099624634, + "num_tokens": 244618765.0, + "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.545972228050232, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8644676208496094, + "num_tokens": 244664122.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.704621434211731, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8639465570449829, + "num_tokens": 244700359.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5649110078811646, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8565890789031982, + "num_tokens": 244741801.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5304484367370605, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8790997266769409, + "num_tokens": 244781675.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6497238874435425, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8625777959823608, + "num_tokens": 244818633.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.5098637342453003, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8708459734916687, + "num_tokens": 244855135.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5594905614852905, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8694944381713867, + "num_tokens": 244890128.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 1.7762184143066406e-05, + "grad_norm": 1.542752742767334, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8568862676620483, + "num_tokens": 244928678.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6061394214630127, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8509258031845093, + "num_tokens": 244967095.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5548070669174194, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8670123815536499, + "num_tokens": 245006188.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6294877529144287, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8711180686950684, + "num_tokens": 245041381.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7345175743103027, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8709492683410645, + "num_tokens": 245073494.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4920588731765747, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8575696349143982, + "num_tokens": 245115546.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5532230138778687, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8728280067443848, + "num_tokens": 245152203.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.8345857858657837, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8444989919662476, + "num_tokens": 245182800.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.648216962814331, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8680776357650757, + "num_tokens": 245218278.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5168057680130005, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8693587779998779, + "num_tokens": 245262750.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5641961097717285, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8754579424858093, + "num_tokens": 245297539.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5349701642990112, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8687976598739624, + "num_tokens": 245334988.0, + "step": 6435 + }, + { + "epoch": 0.8187253530085231, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6322205066680908, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8725486397743225, + "num_tokens": 245372956.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.8905502557754517, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8497363328933716, + "num_tokens": 245403094.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7362110614776611, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8683837056159973, + "num_tokens": 245435649.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.396760106086731, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8709479570388794, + "num_tokens": 245479428.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4909952878952026, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8542546033859253, + "num_tokens": 245520552.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6316014528274536, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8610945343971252, + "num_tokens": 245556979.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5852019786834717, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8741905093193054, + "num_tokens": 245593617.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5891928672790527, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.871849775314331, + "num_tokens": 245627481.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.648140549659729, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.850040078163147, + "num_tokens": 245664255.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.546351432800293, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8655626773834229, + "num_tokens": 245704453.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7439128160476685, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8510415554046631, + "num_tokens": 245740841.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5286295413970947, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8686316013336182, + "num_tokens": 245779187.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5067888498306274, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8803832530975342, + "num_tokens": 245821732.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5951757431030273, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8699817657470703, + "num_tokens": 245859349.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5210134983062744, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.854206919670105, + "num_tokens": 245898974.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6456680297851562, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8619545698165894, + "num_tokens": 245935410.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5361822843551636, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8784576654434204, + "num_tokens": 245971685.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.8060485124588013, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8544394969940186, + "num_tokens": 246004714.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6181083917617798, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.850270688533783, + "num_tokens": 246046539.0, + "step": 6454 + }, + { + "epoch": 0.8211423483017428, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6409138441085815, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8536419868469238, + "num_tokens": 246086563.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4971033334732056, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8776342868804932, + "num_tokens": 246125206.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6967473030090332, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8605115413665771, + "num_tokens": 246162175.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5111802816390991, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.874674379825592, + "num_tokens": 246200827.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6743617057800293, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8675596714019775, + "num_tokens": 246238127.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.445328712463379, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8762971758842468, + "num_tokens": 246279477.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4971307516098022, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8665227890014648, + "num_tokens": 246316932.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5645784139633179, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8547296524047852, + "num_tokens": 246361324.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5696707963943481, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.857507586479187, + "num_tokens": 246402666.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6196500062942505, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.875885009765625, + "num_tokens": 246436163.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5032583475112915, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8707238435745239, + "num_tokens": 246476406.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.7223600149154663, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8499677181243896, + "num_tokens": 246512570.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.503543734550476, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8577930927276611, + "num_tokens": 246553347.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5984704494476318, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8694649934768677, + "num_tokens": 246590996.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5052309036254883, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8761433362960815, + "num_tokens": 246628298.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5684031248092651, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8675301671028137, + "num_tokens": 246664475.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5305156707763672, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8514475226402283, + "num_tokens": 246707871.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.4751613140106201, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8734130859375, + "num_tokens": 246748385.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5466171503067017, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.865415632724762, + "num_tokens": 246785798.0, + "step": 6473 + }, + { + "epoch": 0.8235593435949625, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.8761305809020996, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8584467172622681, + "num_tokens": 246815634.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.6225132942199707, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.868742048740387, + "num_tokens": 246849490.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "ewc_loss": 1.7881393432617188e-05, + "grad_norm": 1.5629483461380005, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8703855276107788, + "num_tokens": 246886705.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6976827383041382, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8589367866516113, + "num_tokens": 246919024.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6483893394470215, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8702508211135864, + "num_tokens": 246951954.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6193939447402954, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8607529401779175, + "num_tokens": 246992198.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5476118326187134, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8804990649223328, + "num_tokens": 247033461.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4657126665115356, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8641117215156555, + "num_tokens": 247076328.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.560783863067627, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8585295081138611, + "num_tokens": 247113773.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5106981992721558, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8681347370147705, + "num_tokens": 247151086.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.690323829650879, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8515470027923584, + "num_tokens": 247185554.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.723331093788147, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8396082520484924, + "num_tokens": 247222630.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5963449478149414, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8833075165748596, + "num_tokens": 247260705.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5299934148788452, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8763623237609863, + "num_tokens": 247297807.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6682337522506714, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8725842237472534, + "num_tokens": 247331284.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5791152715682983, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8637819886207581, + "num_tokens": 247373212.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5007719993591309, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8636289834976196, + "num_tokens": 247412133.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.375125527381897, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8722665309906006, + "num_tokens": 247456698.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4932324886322021, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8663386702537537, + "num_tokens": 247496876.0, + "step": 6492 + }, + { + "epoch": 0.8259763388881821, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4764484167099, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8865714073181152, + "num_tokens": 247530681.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.537047028541565, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8684557676315308, + "num_tokens": 247568409.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6109117269515991, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8752278089523315, + "num_tokens": 247607972.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6049952507019043, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8602341413497925, + "num_tokens": 247645118.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5706998109817505, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8706004023551941, + "num_tokens": 247680267.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5781049728393555, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8576143383979797, + "num_tokens": 247719219.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4951316118240356, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8730018734931946, + "num_tokens": 247756860.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5424699783325195, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8621172904968262, + "num_tokens": 247797087.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.730046033859253, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8588539361953735, + "num_tokens": 247830476.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4944945573806763, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8733593225479126, + "num_tokens": 247870621.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5511759519577026, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8607906103134155, + "num_tokens": 247907259.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5489540100097656, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8558291792869568, + "num_tokens": 247950559.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5101147890090942, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8719875812530518, + "num_tokens": 247989563.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5200833082199097, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8594585657119751, + "num_tokens": 248032386.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.7260065078735352, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8526967167854309, + "num_tokens": 248072395.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6361414194107056, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8586840033531189, + "num_tokens": 248110654.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6793524026870728, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8739279508590698, + "num_tokens": 248143902.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4764037132263184, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8814834952354431, + "num_tokens": 248180605.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6027045249938965, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8704535961151123, + "num_tokens": 248218378.0, + "step": 6511 + }, + { + "epoch": 0.8283933341814018, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5399051904678345, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8610866069793701, + "num_tokens": 248257652.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5883629322052002, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8642141819000244, + "num_tokens": 248296927.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5855605602264404, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8496351838111877, + "num_tokens": 248335018.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5563217401504517, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8621975183486938, + "num_tokens": 248374837.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.553391933441162, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8449773192405701, + "num_tokens": 248418754.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.629460334777832, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8726058602333069, + "num_tokens": 248452252.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4937423467636108, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8691675066947937, + "num_tokens": 248492224.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5519853830337524, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8572372794151306, + "num_tokens": 248529504.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.7938565015792847, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8563652634620667, + "num_tokens": 248561008.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5744726657867432, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8723325133323669, + "num_tokens": 248598331.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5102368593215942, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8604340553283691, + "num_tokens": 248641266.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5681687593460083, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8832024931907654, + "num_tokens": 248675036.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5244615077972412, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8755097389221191, + "num_tokens": 248708524.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6089167594909668, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8596696853637695, + "num_tokens": 248744808.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.541716456413269, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.86597740650177, + "num_tokens": 248782876.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5754047632217407, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8655487895011902, + "num_tokens": 248818269.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5315243005752563, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.879611611366272, + "num_tokens": 248855566.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.512444257736206, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.853826105594635, + "num_tokens": 248902169.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.554403305053711, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8737417459487915, + "num_tokens": 248940061.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6042007207870483, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8595402240753174, + "num_tokens": 248979142.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6563962697982788, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8513846397399902, + "num_tokens": 249015385.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.695310115814209, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8622198104858398, + "num_tokens": 249052948.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.560115933418274, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8699999451637268, + "num_tokens": 249089048.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5112214088439941, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8598474264144897, + "num_tokens": 249130086.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.428000569343567, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8711217045783997, + "num_tokens": 249171763.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5780131816864014, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8538910746574402, + "num_tokens": 249211569.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6515711545944214, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8849513530731201, + "num_tokens": 249244620.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5300986766815186, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8462574481964111, + "num_tokens": 249290288.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5977615118026733, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8624187707901001, + "num_tokens": 249327659.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5656049251556396, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8566862940788269, + "num_tokens": 249369557.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.634105920791626, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8677578568458557, + "num_tokens": 249406287.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.3928426504135132, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8777463436126709, + "num_tokens": 249449005.0, + "step": 6543 + }, + { + "epoch": 0.8324640630962982, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4676486253738403, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8531738519668579, + "num_tokens": 249495051.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.4955089092254639, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8758214712142944, + "num_tokens": 249531894.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.3871145248413086, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8735957145690918, + "num_tokens": 249575337.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5072134733200073, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.869209885597229, + "num_tokens": 249614505.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6407108306884766, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.873015284538269, + "num_tokens": 249647057.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.534834384918213, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8574368953704834, + "num_tokens": 249686421.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.6168256998062134, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8716956377029419, + "num_tokens": 249723486.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.624199390411377, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8533629775047302, + "num_tokens": 249762340.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.5012922286987305, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8537116050720215, + "num_tokens": 249806925.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.7240443229675293, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8597621917724609, + "num_tokens": 249843177.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4882618188858032, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8706091642379761, + "num_tokens": 249882853.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5363792181015015, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8695873022079468, + "num_tokens": 249920781.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6138036251068115, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8642497658729553, + "num_tokens": 249956403.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6514562368392944, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8845068216323853, + "num_tokens": 249989822.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5752902030944824, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8794416189193726, + "num_tokens": 250023370.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5524632930755615, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8574957251548767, + "num_tokens": 250062870.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5121824741363525, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.870901882648468, + "num_tokens": 250103970.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6122965812683105, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8740672469139099, + "num_tokens": 250137271.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4561065435409546, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8622279167175293, + "num_tokens": 250183116.0, + "step": 6562 + }, + { + "epoch": 0.8348810583895179, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4740233421325684, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8724342584609985, + "num_tokens": 250224574.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6023367643356323, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8493345975875854, + "num_tokens": 250265450.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.580657958984375, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8673077821731567, + "num_tokens": 250301445.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7075450420379639, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8667699098587036, + "num_tokens": 250334259.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6012531518936157, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8681970238685608, + "num_tokens": 250368469.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4806700944900513, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.872403621673584, + "num_tokens": 250408905.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6940444707870483, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8588604927062988, + "num_tokens": 250446698.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4649099111557007, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8686538934707642, + "num_tokens": 250491184.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5062050819396973, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.870215654373169, + "num_tokens": 250529837.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4693289995193481, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8698709011077881, + "num_tokens": 250571776.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6072150468826294, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8593812584877014, + "num_tokens": 250610199.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5806747674942017, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8668553829193115, + "num_tokens": 250650214.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6377363204956055, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8590017557144165, + "num_tokens": 250689325.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5404791831970215, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.857363224029541, + "num_tokens": 250730145.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5696766376495361, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8761559724807739, + "num_tokens": 250767340.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.70598304271698, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8604528903961182, + "num_tokens": 250804613.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5231376886367798, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8655431270599365, + "num_tokens": 250848122.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.670743703842163, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8534867167472839, + "num_tokens": 250881602.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.522133708000183, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8673859238624573, + "num_tokens": 250920906.0, + "step": 6581 + }, + { + "epoch": 0.8372980536827376, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5723676681518555, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8737187385559082, + "num_tokens": 250961720.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4991395473480225, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8666595220565796, + "num_tokens": 251002064.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5443798303604126, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8719813823699951, + "num_tokens": 251039103.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5858798027038574, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8698513507843018, + "num_tokens": 251076324.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.603730320930481, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8637254238128662, + "num_tokens": 251114732.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6349694728851318, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8680963516235352, + "num_tokens": 251152219.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5365674495697021, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8787801265716553, + "num_tokens": 251185949.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.516666054725647, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8796256184577942, + "num_tokens": 251223843.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5000697374343872, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8675819039344788, + "num_tokens": 251266094.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4223458766937256, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8759126663208008, + "num_tokens": 251309656.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5555956363677979, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8585072755813599, + "num_tokens": 251352216.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.55393648147583, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8742674589157104, + "num_tokens": 251391805.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6433037519454956, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8456889390945435, + "num_tokens": 251428654.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.524986743927002, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8520040512084961, + "num_tokens": 251470141.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4858520030975342, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8692800998687744, + "num_tokens": 251514150.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5914071798324585, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8636907935142517, + "num_tokens": 251552049.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 1.800060272216797e-05, + "grad_norm": 1.8082984685897827, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8679412603378296, + "num_tokens": 251584928.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7960292100906372, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8530974984169006, + "num_tokens": 251617785.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6657794713974, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.861337423324585, + "num_tokens": 251652170.0, + "step": 6600 + }, + { + "epoch": 0.8397150489759573, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5713081359863281, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8693943619728088, + "num_tokens": 251689048.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4419325590133667, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8737156391143799, + "num_tokens": 251730841.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5681533813476562, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8706779479980469, + "num_tokens": 251769262.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6218140125274658, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8620482087135315, + "num_tokens": 251805521.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6170605421066284, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8738027811050415, + "num_tokens": 251843519.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5789153575897217, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8433244824409485, + "num_tokens": 251888934.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6359652280807495, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8583212494850159, + "num_tokens": 251929956.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6150771379470825, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8552414774894714, + "num_tokens": 251968643.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4940348863601685, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8637198209762573, + "num_tokens": 252012591.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6166619062423706, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8820470571517944, + "num_tokens": 252047966.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4773818254470825, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.877165675163269, + "num_tokens": 252088493.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 20.863815307617188, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.851737380027771, + "num_tokens": 252125341.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7413235902786255, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8443408012390137, + "num_tokens": 252159047.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6911730766296387, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8599846363067627, + "num_tokens": 252193688.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5326191186904907, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8736424446105957, + "num_tokens": 252231221.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6252596378326416, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8643350601196289, + "num_tokens": 252269723.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5937669277191162, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8737043142318726, + "num_tokens": 252302806.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5512372255325317, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.863970160484314, + "num_tokens": 252342019.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5552648305892944, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8670464754104614, + "num_tokens": 252383116.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.651196002960205, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8632732629776001, + "num_tokens": 252418584.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.751090168952942, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8611838817596436, + "num_tokens": 252450516.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.487007975578308, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.878534734249115, + "num_tokens": 252489838.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6502097845077515, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8573718070983887, + "num_tokens": 252525999.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.624884843826294, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8663349747657776, + "num_tokens": 252561241.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5523877143859863, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8672968745231628, + "num_tokens": 252600283.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.7646111249923706, + "learning_rate": 1e-06, + "loss": 0.5472, + "mean_token_accuracy": 0.8302062749862671, + "num_tokens": 252642872.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5642650127410889, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8639687299728394, + "num_tokens": 252685897.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5436973571777344, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8632020354270935, + "num_tokens": 252723499.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.656711220741272, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8709462881088257, + "num_tokens": 252754957.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6471285820007324, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8566210269927979, + "num_tokens": 252791309.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5725653171539307, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.866035521030426, + "num_tokens": 252829206.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.461819052696228, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8654136657714844, + "num_tokens": 252871124.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5479793548583984, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.881565272808075, + "num_tokens": 252907508.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6394250392913818, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8517021536827087, + "num_tokens": 252949724.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6177397966384888, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8682254552841187, + "num_tokens": 252987850.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6129859685897827, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.862763524055481, + "num_tokens": 253026028.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6553767919540405, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8531926274299622, + "num_tokens": 253067712.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.632760763168335, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8856390714645386, + "num_tokens": 253101024.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7162020206451416, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.862916886806488, + "num_tokens": 253133544.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4102295637130737, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8643271923065186, + "num_tokens": 253180409.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6994184255599976, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8566539883613586, + "num_tokens": 253212657.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6451762914657593, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8701267242431641, + "num_tokens": 253249820.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6919440031051636, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8709613084793091, + "num_tokens": 253283728.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.579087257385254, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8594030737876892, + "num_tokens": 253327011.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6287753582000732, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8663414716720581, + "num_tokens": 253362273.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5379900932312012, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.868283748626709, + "num_tokens": 253402854.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.527557134628296, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8583574295043945, + "num_tokens": 253444333.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4900944232940674, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8759164810180664, + "num_tokens": 253485009.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4967825412750244, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8609562516212463, + "num_tokens": 253523951.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5800801515579224, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8735969066619873, + "num_tokens": 253560441.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.506942629814148, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8715150356292725, + "num_tokens": 253602340.0, + "step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.484459400177002, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8579818606376648, + "num_tokens": 253642008.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.58681058883667, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8698438405990601, + "num_tokens": 253677452.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.671962857246399, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8325188755989075, + "num_tokens": 253717065.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6284816265106201, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8637701272964478, + "num_tokens": 253757369.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6210461854934692, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8777052760124207, + "num_tokens": 253791658.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6903877258300781, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8601977229118347, + "num_tokens": 253825834.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5029406547546387, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.870367169380188, + "num_tokens": 253865156.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.596750259399414, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8720312118530273, + "num_tokens": 253896638.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5864534378051758, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8580029010772705, + "num_tokens": 253932575.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6186851263046265, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8644987940788269, + "num_tokens": 253969307.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5063955783843994, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8638138175010681, + "num_tokens": 254009742.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5210585594177246, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8698003888130188, + "num_tokens": 254048926.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6112353801727295, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8636826872825623, + "num_tokens": 254084945.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6432851552963257, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8750443458557129, + "num_tokens": 254120496.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5806114673614502, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8592503070831299, + "num_tokens": 254157141.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.602565884590149, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.855128824710846, + "num_tokens": 254197901.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6140481233596802, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8662252426147461, + "num_tokens": 254235900.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5392457246780396, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8649320602416992, + "num_tokens": 254275102.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5983734130859375, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8564146161079407, + "num_tokens": 254313551.0, + "step": 6670 + }, + { + "epoch": 0.8486197684772929, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5666502714157104, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8554925918579102, + "num_tokens": 254353281.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6144742965698242, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8683398962020874, + "num_tokens": 254390754.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4697768688201904, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8712635040283203, + "num_tokens": 254432708.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.697580099105835, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8590922355651855, + "num_tokens": 254469048.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.561769962310791, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8547400236129761, + "num_tokens": 254510218.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6729028224945068, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8716890811920166, + "num_tokens": 254543486.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4957044124603271, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8678638339042664, + "num_tokens": 254583816.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4897661209106445, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8554731607437134, + "num_tokens": 254625346.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.487735390663147, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8621644973754883, + "num_tokens": 254666742.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7501308917999268, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8595803380012512, + "num_tokens": 254698823.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6537854671478271, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8664572238922119, + "num_tokens": 254734855.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4886444807052612, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8649427890777588, + "num_tokens": 254774552.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7946149110794067, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8659955263137817, + "num_tokens": 254810052.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.63600754737854, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.878646731376648, + "num_tokens": 254843496.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.522605061531067, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8809731006622314, + "num_tokens": 254881280.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.671249270439148, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8702380657196045, + "num_tokens": 254919777.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7077219486236572, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8627455234527588, + "num_tokens": 254953132.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5180420875549316, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8618253469467163, + "num_tokens": 254995602.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4980212450027466, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8842163681983948, + "num_tokens": 255035275.0, + "step": 6689 + }, + { + "epoch": 0.8510367637705126, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7492798566818237, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8419021368026733, + "num_tokens": 255068094.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.547905683517456, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8665632009506226, + "num_tokens": 255108495.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7312463521957397, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8741673231124878, + "num_tokens": 255138652.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.732564091682434, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8533092141151428, + "num_tokens": 255173499.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7994565963745117, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8393442630767822, + "num_tokens": 255211216.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5494152307510376, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8633944988250732, + "num_tokens": 255253690.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4467452764511108, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8684254288673401, + "num_tokens": 255294964.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7655357122421265, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8462135791778564, + "num_tokens": 255328641.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4832167625427246, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8617184162139893, + "num_tokens": 255369509.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5010966062545776, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8677982687950134, + "num_tokens": 255407891.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4575355052947998, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8779498934745789, + "num_tokens": 255445262.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6021450757980347, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8767614960670471, + "num_tokens": 255482140.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4822901487350464, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8811751008033752, + "num_tokens": 255525308.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5950335264205933, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.857751190662384, + "num_tokens": 255564126.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.589805245399475, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8737033605575562, + "num_tokens": 255602781.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6741716861724854, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8709192276000977, + "num_tokens": 255638266.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6010146141052246, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8388111591339111, + "num_tokens": 255677480.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5200763940811157, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.872969925403595, + "num_tokens": 255715124.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4908713102340698, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8644999861717224, + "num_tokens": 255755060.0, + "step": 6708 + }, + { + "epoch": 0.8534537590637323, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.594472050666809, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.875158429145813, + "num_tokens": 255790251.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4937940835952759, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8514031171798706, + "num_tokens": 255835348.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5051212310791016, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8655664920806885, + "num_tokens": 255879161.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6903271675109863, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8642183542251587, + "num_tokens": 255912312.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7348326444625854, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8734639883041382, + "num_tokens": 255943179.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5711883306503296, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8667664527893066, + "num_tokens": 255980291.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6051208972930908, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8680629730224609, + "num_tokens": 256014186.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6018179655075073, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8605496287345886, + "num_tokens": 256050070.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.736703634262085, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8591316342353821, + "num_tokens": 256089052.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.602181315422058, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8561278581619263, + "num_tokens": 256133365.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.70787513256073, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.861591100692749, + "num_tokens": 256168244.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6148377656936646, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8612741827964783, + "num_tokens": 256205699.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6652108430862427, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8684953451156616, + "num_tokens": 256242491.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 3.6887898445129395, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8855908513069153, + "num_tokens": 256278000.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6988601684570312, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8738393783569336, + "num_tokens": 256312855.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5833921432495117, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8712823390960693, + "num_tokens": 256350818.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6398093700408936, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8550823330879211, + "num_tokens": 256389057.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.413824200630188, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8816782236099243, + "num_tokens": 256430253.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5806970596313477, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.878477931022644, + "num_tokens": 256464815.0, + "step": 6727 + }, + { + "epoch": 0.855870754356952, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.670341968536377, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.877895712852478, + "num_tokens": 256496172.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5979728698730469, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8687707185745239, + "num_tokens": 256530694.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6604114770889282, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8709888458251953, + "num_tokens": 256565582.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7079590559005737, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8554900884628296, + "num_tokens": 256599396.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5509541034698486, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8777602910995483, + "num_tokens": 256638571.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5425019264221191, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8706063032150269, + "num_tokens": 256679477.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.543741226196289, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8548028469085693, + "num_tokens": 256719916.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.485144853591919, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8750477433204651, + "num_tokens": 256760620.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6165521144866943, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8676069974899292, + "num_tokens": 256797963.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5432407855987549, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8542953729629517, + "num_tokens": 256840554.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5308247804641724, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8679168820381165, + "num_tokens": 256878536.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6026273965835571, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8648688793182373, + "num_tokens": 256914598.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.508519172668457, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8723914623260498, + "num_tokens": 256954947.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6466327905654907, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.859764814376831, + "num_tokens": 256994709.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6822847127914429, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8704771995544434, + "num_tokens": 257028081.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6316567659378052, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.846004843711853, + "num_tokens": 257069658.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.722109317779541, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8488931655883789, + "num_tokens": 257103565.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4522113800048828, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8640400767326355, + "num_tokens": 257147376.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4899507761001587, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8573790788650513, + "num_tokens": 257188583.0, + "step": 6746 + }, + { + "epoch": 0.8582877496501717, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6282565593719482, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8545119762420654, + "num_tokens": 257223878.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 3.725247621536255, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8649938106536865, + "num_tokens": 257263023.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6842321157455444, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8609473705291748, + "num_tokens": 257294224.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5761042833328247, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8713080883026123, + "num_tokens": 257331776.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.558415174484253, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8573874235153198, + "num_tokens": 257372677.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6309924125671387, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8677423596382141, + "num_tokens": 257409760.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5874207019805908, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8632920980453491, + "num_tokens": 257442109.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5507880449295044, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8689079880714417, + "num_tokens": 257477602.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5550612211227417, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8576889038085938, + "num_tokens": 257516658.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5033811330795288, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8714654445648193, + "num_tokens": 257555961.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5112736225128174, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8596693277359009, + "num_tokens": 257597294.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5254430770874023, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8696510195732117, + "num_tokens": 257633848.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.481296181678772, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8602825403213501, + "num_tokens": 257679440.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5692076683044434, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8643385171890259, + "num_tokens": 257716585.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5628633499145508, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8577592372894287, + "num_tokens": 257754587.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5245695114135742, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8594546914100647, + "num_tokens": 257797947.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6903424263000488, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8708548545837402, + "num_tokens": 257830609.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6018877029418945, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8624028563499451, + "num_tokens": 257871911.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.834391713142395, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8386411666870117, + "num_tokens": 257907910.0, + "step": 6765 + }, + { + "epoch": 0.8607047449433914, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5422091484069824, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8494409918785095, + "num_tokens": 257947570.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.7753328084945679, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8522585034370422, + "num_tokens": 257980018.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5104098320007324, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8780924677848816, + "num_tokens": 258020779.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5594875812530518, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8608353734016418, + "num_tokens": 258061040.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4921878576278687, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8647019863128662, + "num_tokens": 258103773.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6280848979949951, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8672885894775391, + "num_tokens": 258142120.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.4949824810028076, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8617282509803772, + "num_tokens": 258183953.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.559903860092163, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8695831298828125, + "num_tokens": 258221399.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.5825746059417725, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8578959703445435, + "num_tokens": 258256899.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 1.811981201171875e-05, + "grad_norm": 1.6197071075439453, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8732429146766663, + "num_tokens": 258289921.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.3310747146606445, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8813574314117432, + "num_tokens": 258325929.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6436488628387451, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8733971118927002, + "num_tokens": 258362185.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6562482118606567, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8522239923477173, + "num_tokens": 258398925.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.9959675073623657, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8581917881965637, + "num_tokens": 258428148.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6976540088653564, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8726752996444702, + "num_tokens": 258467424.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7373888492584229, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8536522388458252, + "num_tokens": 258504190.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.588877558708191, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8741905689239502, + "num_tokens": 258538631.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7017980813980103, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8585939407348633, + "num_tokens": 258573545.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5589704513549805, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8799720406532288, + "num_tokens": 258612314.0, + "step": 6784 + }, + { + "epoch": 0.8631217402366111, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7518229484558105, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8554847836494446, + "num_tokens": 258651179.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4823867082595825, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.88438880443573, + "num_tokens": 258689071.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4684357643127441, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8624320030212402, + "num_tokens": 258733136.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5481585264205933, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8560839891433716, + "num_tokens": 258771391.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.69473397731781, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8732122182846069, + "num_tokens": 258806626.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4844895601272583, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8761776685714722, + "num_tokens": 258847184.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5959054231643677, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8611843585968018, + "num_tokens": 258885613.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5965315103530884, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8708985447883606, + "num_tokens": 258926716.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6962076425552368, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8500176072120667, + "num_tokens": 258964541.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6394977569580078, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8626466393470764, + "num_tokens": 259002717.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.8827109336853027, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8713252544403076, + "num_tokens": 259039563.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5730839967727661, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8641145825386047, + "num_tokens": 259081185.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6081960201263428, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8691275715827942, + "num_tokens": 259115652.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5802910327911377, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8711483478546143, + "num_tokens": 259154229.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5149861574172974, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8786208629608154, + "num_tokens": 259191416.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6268552541732788, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8584519624710083, + "num_tokens": 259228455.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.730546474456787, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8437530994415283, + "num_tokens": 259269558.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5005135536193848, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8677253723144531, + "num_tokens": 259308678.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.605069637298584, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8715067505836487, + "num_tokens": 259343797.0, + "step": 6803 + }, + { + "epoch": 0.8655387355298308, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4598195552825928, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8759653568267822, + "num_tokens": 259382111.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.663553237915039, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8576818108558655, + "num_tokens": 259415738.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.527585506439209, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8633493185043335, + "num_tokens": 259456906.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4649258852005005, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8883015513420105, + "num_tokens": 259494330.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7139328718185425, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8761273622512817, + "num_tokens": 259527335.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5562387704849243, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8748468160629272, + "num_tokens": 259560988.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5233567953109741, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8754939436912537, + "num_tokens": 259598020.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4095971584320068, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.873066782951355, + "num_tokens": 259640840.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4569765329360962, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8707196712493896, + "num_tokens": 259682056.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5951167345046997, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8628498911857605, + "num_tokens": 259723311.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6522136926651, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8641320466995239, + "num_tokens": 259760106.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5348085165023804, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8686672449111938, + "num_tokens": 259798550.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4867568016052246, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8701608180999756, + "num_tokens": 259841507.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5872502326965332, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8555086851119995, + "num_tokens": 259881868.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7186671495437622, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8657342791557312, + "num_tokens": 259912446.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5655521154403687, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8656511306762695, + "num_tokens": 259956510.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4825067520141602, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8621900677680969, + "num_tokens": 259997912.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7229909896850586, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8509364128112793, + "num_tokens": 260030463.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5460675954818726, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8598301410675049, + "num_tokens": 260072144.0, + "step": 6822 + }, + { + "epoch": 0.8679557308230506, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5566486120224, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.883712112903595, + "num_tokens": 260109434.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.547980785369873, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8705828189849854, + "num_tokens": 260147018.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5651861429214478, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8655784130096436, + "num_tokens": 260183593.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6612967252731323, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.86429363489151, + "num_tokens": 260220567.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6225597858428955, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8670431971549988, + "num_tokens": 260257939.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.701080083847046, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8594788312911987, + "num_tokens": 260300234.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6115375757217407, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8695155382156372, + "num_tokens": 260339172.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.507940649986267, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8718687891960144, + "num_tokens": 260377115.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5090923309326172, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.86275315284729, + "num_tokens": 260418740.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6201555728912354, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8336116075515747, + "num_tokens": 260468537.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.649404764175415, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8811020851135254, + "num_tokens": 260505602.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5900055170059204, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8687859177589417, + "num_tokens": 260539064.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 3.687126636505127, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8766304850578308, + "num_tokens": 260572235.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5079518556594849, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8599595427513123, + "num_tokens": 260620092.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5202569961547852, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8882623314857483, + "num_tokens": 260657769.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7284703254699707, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8752257823944092, + "num_tokens": 260690954.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6420634984970093, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8607087135314941, + "num_tokens": 260729376.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5250024795532227, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8602645993232727, + "num_tokens": 260772100.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5760637521743774, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8641980290412903, + "num_tokens": 260810080.0, + "step": 6841 + }, + { + "epoch": 0.8703727261162701, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.581741213798523, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8547496199607849, + "num_tokens": 260850133.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5276927947998047, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8647022843360901, + "num_tokens": 260891553.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7137900590896606, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8759238719940186, + "num_tokens": 260929881.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6197270154953003, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8525922298431396, + "num_tokens": 260968238.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5443700551986694, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8623575568199158, + "num_tokens": 261008107.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4556660652160645, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8770778179168701, + "num_tokens": 261052354.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.612399935722351, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.865249752998352, + "num_tokens": 261090519.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.519532561302185, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8621066212654114, + "num_tokens": 261132654.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.44497549533844, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8705196380615234, + "num_tokens": 261178942.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6036615371704102, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8581210374832153, + "num_tokens": 261221254.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.8220000267028809, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8449822664260864, + "num_tokens": 261258488.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5704697370529175, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8462715148925781, + "num_tokens": 261300488.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.732166051864624, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8785104751586914, + "num_tokens": 261337711.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6004945039749146, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8732808828353882, + "num_tokens": 261375297.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5481795072555542, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8701876401901245, + "num_tokens": 261416395.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7112561464309692, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8640351295471191, + "num_tokens": 261452116.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5776747465133667, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8703277707099915, + "num_tokens": 261486606.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6797679662704468, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8632947206497192, + "num_tokens": 261523188.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.656167984008789, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.839248538017273, + "num_tokens": 261563866.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6769717931747437, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8589890003204346, + "num_tokens": 261603162.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7449569702148438, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8437486886978149, + "num_tokens": 261643398.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 3.6979098320007324, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8673526644706726, + "num_tokens": 261677973.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4519566297531128, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8727894425392151, + "num_tokens": 261723364.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6396896839141846, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8713022470474243, + "num_tokens": 261759234.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.625291347503662, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8708987832069397, + "num_tokens": 261796485.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.624193549156189, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8563765287399292, + "num_tokens": 261835031.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5496658086776733, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8664040565490723, + "num_tokens": 261876892.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5962854623794556, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.867378830909729, + "num_tokens": 261916006.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5620516538619995, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8757281303405762, + "num_tokens": 261952324.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.477923035621643, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8696557283401489, + "num_tokens": 261995706.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6686049699783325, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8554676175117493, + "num_tokens": 262031375.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.539232850074768, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8671615123748779, + "num_tokens": 262071932.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.504475712776184, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8768804669380188, + "num_tokens": 262107596.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5109564065933228, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.866125226020813, + "num_tokens": 262149157.0, + "step": 6875 + }, + { + "epoch": 0.8746978755883476, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5618559122085571, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8651005625724792, + "num_tokens": 262186520.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.579201340675354, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8495793342590332, + "num_tokens": 262227366.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6491345167160034, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8578712940216064, + "num_tokens": 262264314.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.235746383666992, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8677855730056763, + "num_tokens": 262301444.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.60468327999115, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8652881383895874, + "num_tokens": 262341943.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6906518936157227, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8717443346977234, + "num_tokens": 262376007.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5510585308074951, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8628837466239929, + "num_tokens": 262417103.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5514034032821655, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8582538962364197, + "num_tokens": 262458175.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5901297330856323, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.84942626953125, + "num_tokens": 262500782.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.9954675436019897, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8634442687034607, + "num_tokens": 262541939.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.617968201637268, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8656368851661682, + "num_tokens": 262577593.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5848395824432373, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8594341278076172, + "num_tokens": 262616041.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4819248914718628, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.869466245174408, + "num_tokens": 262655876.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4800803661346436, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8598759174346924, + "num_tokens": 262697078.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.76069176197052, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8682065010070801, + "num_tokens": 262729584.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5475679636001587, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8502136468887329, + "num_tokens": 262768800.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5589519739151, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8582584857940674, + "num_tokens": 262811246.0, + "step": 6892 + }, + { + "epoch": 0.8768604503243862, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7325290441513062, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8438951373100281, + "num_tokens": 262844771.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5995851755142212, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8549676537513733, + "num_tokens": 262884116.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6193864345550537, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8610856533050537, + "num_tokens": 262919905.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.557025671005249, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.864808201789856, + "num_tokens": 262960578.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6623717546463013, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8599637150764465, + "num_tokens": 262996601.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5878742933273315, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8641864061355591, + "num_tokens": 263032810.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6245765686035156, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8384033441543579, + "num_tokens": 263073050.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.57175612449646, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8640965223312378, + "num_tokens": 263111597.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5743542909622192, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8761930465698242, + "num_tokens": 263150477.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5026414394378662, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8819342851638794, + "num_tokens": 263190529.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5817841291427612, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8753719329833984, + "num_tokens": 263224225.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7860246896743774, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8600497245788574, + "num_tokens": 263254471.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5161436796188354, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8645778298377991, + "num_tokens": 263293675.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.509861707687378, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8711591362953186, + "num_tokens": 263335069.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5225396156311035, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8875383734703064, + "num_tokens": 263371141.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5451481342315674, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8626161813735962, + "num_tokens": 263408085.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4309628009796143, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8749872446060181, + "num_tokens": 263452306.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.667711853981018, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8697284460067749, + "num_tokens": 263487567.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6953370571136475, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8655299544334412, + "num_tokens": 263524851.0, + "step": 6911 + }, + { + "epoch": 0.8792774456176059, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5198228359222412, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8695870637893677, + "num_tokens": 263564008.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4770764112472534, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8670216798782349, + "num_tokens": 263610572.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5209506750106812, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8603488206863403, + "num_tokens": 263653280.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.672471523284912, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8733470439910889, + "num_tokens": 263688077.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6936933994293213, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8667280673980713, + "num_tokens": 263719254.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7334516048431396, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8518847823143005, + "num_tokens": 263760354.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4727085828781128, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.876759946346283, + "num_tokens": 263797290.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5433974266052246, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8582046031951904, + "num_tokens": 263837802.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5082114934921265, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8643827438354492, + "num_tokens": 263874611.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5506736040115356, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8706285953521729, + "num_tokens": 263913872.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5630544424057007, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8731509447097778, + "num_tokens": 263947002.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5238571166992188, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8766015768051147, + "num_tokens": 263988221.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5455116033554077, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8653782606124878, + "num_tokens": 264026166.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6417479515075684, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8473283052444458, + "num_tokens": 264064279.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.651391625404358, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8564005494117737, + "num_tokens": 264098638.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4773938655853271, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8725104331970215, + "num_tokens": 264136601.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5339024066925049, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8640604019165039, + "num_tokens": 264178543.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7202969789505005, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8438607454299927, + "num_tokens": 264214572.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6306158304214478, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8581500053405762, + "num_tokens": 264255191.0, + "step": 6930 + }, + { + "epoch": 0.8816944409108256, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5590651035308838, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8591187000274658, + "num_tokens": 264292985.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4331133365631104, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8712539672851562, + "num_tokens": 264338050.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.489606499671936, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8603998422622681, + "num_tokens": 264378871.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7418467998504639, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8718147277832031, + "num_tokens": 264408737.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.524916648864746, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8677383661270142, + "num_tokens": 264447414.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.601819396018982, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8518362641334534, + "num_tokens": 264488056.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.3991189002990723, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8665290474891663, + "num_tokens": 264536591.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4957382678985596, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8663811683654785, + "num_tokens": 264578427.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6783579587936401, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8548756837844849, + "num_tokens": 264614381.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5124762058258057, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8741727471351624, + "num_tokens": 264655174.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5913351774215698, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8562307357788086, + "num_tokens": 264694697.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5996342897415161, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8740125894546509, + "num_tokens": 264729681.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.649144172668457, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8567638397216797, + "num_tokens": 264766969.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6816339492797852, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8619459867477417, + "num_tokens": 264802134.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4576088190078735, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8580514192581177, + "num_tokens": 264846837.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6040077209472656, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8729665279388428, + "num_tokens": 264881426.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4972529411315918, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8621349930763245, + "num_tokens": 264924377.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5259791612625122, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.863270103931427, + "num_tokens": 264962403.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5509878396987915, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8531767129898071, + "num_tokens": 265001140.0, + "step": 6949 + }, + { + "epoch": 0.8841114362040453, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5765838623046875, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8684484958648682, + "num_tokens": 265038566.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5645641088485718, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8628603219985962, + "num_tokens": 265077915.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 2.1379427909851074, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8770349621772766, + "num_tokens": 265118653.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.521166205406189, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8689824342727661, + "num_tokens": 265157781.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.650408148765564, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8667501211166382, + "num_tokens": 265195828.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.742912769317627, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.865511953830719, + "num_tokens": 265232064.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7533578872680664, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8576165437698364, + "num_tokens": 265267027.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6158584356307983, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8535786867141724, + "num_tokens": 265305046.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.555545687675476, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8615618348121643, + "num_tokens": 265343651.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5901890993118286, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.858710765838623, + "num_tokens": 265379985.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6971744298934937, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8631187677383423, + "num_tokens": 265414307.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5367876291275024, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8634288311004639, + "num_tokens": 265455515.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6747790575027466, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8834142088890076, + "num_tokens": 265488094.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5878962278366089, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8583753705024719, + "num_tokens": 265526810.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4958453178405762, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.875184178352356, + "num_tokens": 265567006.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6834534406661987, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.860318124294281, + "num_tokens": 265603025.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6254971027374268, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8482560515403748, + "num_tokens": 265640007.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.555909276008606, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8573959469795227, + "num_tokens": 265682929.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5632972717285156, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8754978179931641, + "num_tokens": 265724469.0, + "step": 6968 + }, + { + "epoch": 0.886528431497265, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.7750524282455444, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8710598349571228, + "num_tokens": 265755678.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 3.814462661743164, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.830042839050293, + "num_tokens": 265796566.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6217737197875977, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8814442753791809, + "num_tokens": 265835066.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5056097507476807, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8701165914535522, + "num_tokens": 265877759.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6431151628494263, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8735178709030151, + "num_tokens": 265911331.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5233818292617798, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8631604909896851, + "num_tokens": 265952968.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6633224487304688, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8660971522331238, + "num_tokens": 265988456.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.696393609046936, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8655389547348022, + "num_tokens": 266024757.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6129754781723022, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8592707514762878, + "num_tokens": 266063942.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.561929702758789, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8766988515853882, + "num_tokens": 266096605.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.532583475112915, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.863824725151062, + "num_tokens": 266136315.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.663012146949768, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8662102818489075, + "num_tokens": 266168642.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5793489217758179, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8582712411880493, + "num_tokens": 266208614.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.790366768836975, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8636156916618347, + "num_tokens": 266238944.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4734525680541992, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8799686431884766, + "num_tokens": 266279034.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.706298828125, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8553088903427124, + "num_tokens": 266316483.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4451903104782104, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.867402195930481, + "num_tokens": 266359588.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5367786884307861, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8475280404090881, + "num_tokens": 266398569.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5490533113479614, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8567477464675903, + "num_tokens": 266437136.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.589227557182312, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.873150110244751, + "num_tokens": 266478071.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5564088821411133, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8618318438529968, + "num_tokens": 266518212.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5931042432785034, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8621567487716675, + "num_tokens": 266556674.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6757668256759644, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.874608039855957, + "num_tokens": 266588801.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.602748155593872, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8554208278656006, + "num_tokens": 266628051.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.661773681640625, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8537378907203674, + "num_tokens": 266666918.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.693455696105957, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.862617552280426, + "num_tokens": 266703411.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.5721900463104248, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8519959449768066, + "num_tokens": 266743394.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.498552918434143, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8880870342254639, + "num_tokens": 266784645.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.9516202211380005, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8584811091423035, + "num_tokens": 266820387.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4793068170547485, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8602821826934814, + "num_tokens": 266863217.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4615824222564697, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8795381784439087, + "num_tokens": 266904414.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.50961172580719, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8642556667327881, + "num_tokens": 266946709.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6889853477478027, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8527799844741821, + "num_tokens": 266981904.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6263781785964966, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.83582603931427, + "num_tokens": 267020351.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.586000919342041, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.864915668964386, + "num_tokens": 267059720.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.6077300310134888, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8527825474739075, + "num_tokens": 267101440.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.4523389339447021, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.879524827003479, + "num_tokens": 267141146.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 1.823902130126953e-05, + "grad_norm": 1.666976809501648, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8712122440338135, + "num_tokens": 267174321.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.6291272640228271, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8576633334159851, + "num_tokens": 267214405.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.3520828485488892, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8823884725570679, + "num_tokens": 267253174.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.4853026866912842, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8534637689590454, + "num_tokens": 267299098.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5824345350265503, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8622478246688843, + "num_tokens": 267333612.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.495573878288269, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8719156384468079, + "num_tokens": 267374036.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.7503256797790527, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8601333498954773, + "num_tokens": 267407453.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5916144847869873, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8662217855453491, + "num_tokens": 267440996.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.6560968160629272, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8584773540496826, + "num_tokens": 267480116.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5406701564788818, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8568433523178101, + "num_tokens": 267523638.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.4966083765029907, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8683395385742188, + "num_tokens": 267564069.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5165504217147827, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8777662515640259, + "num_tokens": 267601415.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.536073923110962, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.865624189376831, + "num_tokens": 267641333.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.597805380821228, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8647927641868591, + "num_tokens": 267680730.0, + "step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.6864298582077026, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8519952893257141, + "num_tokens": 267719179.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5112468004226685, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8661211133003235, + "num_tokens": 267760394.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5464656352996826, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8650040626525879, + "num_tokens": 267799399.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.7022689580917358, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8539156913757324, + "num_tokens": 267830936.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5564886331558228, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8550378084182739, + "num_tokens": 267874764.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.6421645879745483, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8622644543647766, + "num_tokens": 267915137.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.607676386833191, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8550630211830139, + "num_tokens": 267958328.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.7816932201385498, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8677971363067627, + "num_tokens": 267992517.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.6723036766052246, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8776432275772095, + "num_tokens": 268023133.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5926717519760132, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8658291101455688, + "num_tokens": 268062935.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5676428079605103, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.881010890007019, + "num_tokens": 268098815.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5613726377487183, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8483824133872986, + "num_tokens": 268144256.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.5361452102661133, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8663841485977173, + "num_tokens": 268184437.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5132578611373901, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8581969738006592, + "num_tokens": 268227158.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.4913959503173828, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8684464693069458, + "num_tokens": 268266360.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 1.8358230590820312e-05, + "grad_norm": 1.7046875953674316, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8646071553230286, + "num_tokens": 268300760.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6478321552276611, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8601840734481812, + "num_tokens": 268338766.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6804054975509644, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8565851449966431, + "num_tokens": 268372013.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5543208122253418, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8717100620269775, + "num_tokens": 268410851.0, + "step": 7038 + }, + { + "epoch": 0.8954331509986007, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5431686639785767, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.885478675365448, + "num_tokens": 268444816.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6764992475509644, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8624149560928345, + "num_tokens": 268487640.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.7724696397781372, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8739129304885864, + "num_tokens": 268520184.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5686808824539185, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8725017309188843, + "num_tokens": 268557166.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6600260734558105, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8598175048828125, + "num_tokens": 268593799.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6148194074630737, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8618793487548828, + "num_tokens": 268632905.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6801574230194092, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8700680136680603, + "num_tokens": 268667938.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.625266194343567, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8815448880195618, + "num_tokens": 268699816.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6075242757797241, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.859383761882782, + "num_tokens": 268738938.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.708522081375122, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8437372446060181, + "num_tokens": 268779302.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6965529918670654, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8817417621612549, + "num_tokens": 268812122.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.7771413326263428, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8523357510566711, + "num_tokens": 268845696.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.614477515220642, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8646275997161865, + "num_tokens": 268883102.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5373553037643433, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8714208602905273, + "num_tokens": 268924043.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5129557847976685, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.869592547416687, + "num_tokens": 268965955.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6477441787719727, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8608275055885315, + "num_tokens": 269003374.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5194426774978638, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8619445562362671, + "num_tokens": 269048203.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.4287184476852417, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.885759711265564, + "num_tokens": 269085618.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5604294538497925, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8455894589424133, + "num_tokens": 269125577.0, + "step": 7057 + }, + { + "epoch": 0.8978501462918204, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.4964981079101562, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8858118653297424, + "num_tokens": 269164009.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.61946702003479, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8582371473312378, + "num_tokens": 269200727.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6171822547912598, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8583037853240967, + "num_tokens": 269238940.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.532941460609436, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8615105152130127, + "num_tokens": 269275624.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6701321601867676, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8676724433898926, + "num_tokens": 269310233.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6358375549316406, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8607460260391235, + "num_tokens": 269348118.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6282259225845337, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8576316833496094, + "num_tokens": 269386442.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5259947776794434, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8561170101165771, + "num_tokens": 269429148.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6056880950927734, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8480529189109802, + "num_tokens": 269471187.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.492915153503418, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8529301881790161, + "num_tokens": 269512461.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5210336446762085, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8725576996803284, + "num_tokens": 269548569.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5010877847671509, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.865838885307312, + "num_tokens": 269588950.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.503989577293396, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8639582395553589, + "num_tokens": 269627313.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.4740160703659058, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8727256059646606, + "num_tokens": 269672773.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.548527479171753, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8821408152580261, + "num_tokens": 269708954.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.62846839427948, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.855426549911499, + "num_tokens": 269743050.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5672791004180908, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8756648302078247, + "num_tokens": 269777327.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6630090475082397, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8505680561065674, + "num_tokens": 269816598.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.3820264339447021, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8800594806671143, + "num_tokens": 269858332.0, + "step": 7076 + }, + { + "epoch": 0.9002671415850401, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5932621955871582, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.87345290184021, + "num_tokens": 269890797.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6287176609039307, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8726671934127808, + "num_tokens": 269923366.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6053447723388672, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8707419037818909, + "num_tokens": 269960179.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.65202796459198, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8576719760894775, + "num_tokens": 269997369.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5109789371490479, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8644004464149475, + "num_tokens": 270037072.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5749704837799072, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8613044023513794, + "num_tokens": 270075107.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6694788932800293, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8377211093902588, + "num_tokens": 270115806.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.7046117782592773, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8494542241096497, + "num_tokens": 270151413.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6153594255447388, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8416295647621155, + "num_tokens": 270191407.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.8687469959259033, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8840159177780151, + "num_tokens": 270229068.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.4956849813461304, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8733760118484497, + "num_tokens": 270268101.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.659842610359192, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8595362901687622, + "num_tokens": 270303457.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.7485374212265015, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8776313066482544, + "num_tokens": 270337723.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.4635651111602783, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8714463710784912, + "num_tokens": 270380417.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6338484287261963, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8650490045547485, + "num_tokens": 270421955.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5189045667648315, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8635607957839966, + "num_tokens": 270460645.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.5466787815093994, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8789627552032471, + "num_tokens": 270494087.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.4899958372116089, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8765904903411865, + "num_tokens": 270533820.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.3644485473632812, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8858494162559509, + "num_tokens": 270578248.0, + "step": 7095 + }, + { + "epoch": 0.9026841368782598, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.530564785003662, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.854581892490387, + "num_tokens": 270619448.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.4704666137695312, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.874838650226593, + "num_tokens": 270659627.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5089110136032104, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.872279942035675, + "num_tokens": 270698111.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5025871992111206, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8709350824356079, + "num_tokens": 270735999.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.7018424272537231, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8490785360336304, + "num_tokens": 270770045.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.4842363595962524, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8696227073669434, + "num_tokens": 270813658.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5944846868515015, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8598816394805908, + "num_tokens": 270851363.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6074339151382446, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.864690363407135, + "num_tokens": 270886907.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5944846868515015, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.858676552772522, + "num_tokens": 270922831.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6298331022262573, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8774479031562805, + "num_tokens": 270957982.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5163389444351196, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8569396734237671, + "num_tokens": 271003880.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.8176958560943604, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8593388199806213, + "num_tokens": 271035133.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.5170336961746216, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8562115430831909, + "num_tokens": 271080158.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.442187786102295, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8754039406776428, + "num_tokens": 271125370.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6417564153671265, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8702737092971802, + "num_tokens": 271161542.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.575382113456726, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8632222414016724, + "num_tokens": 271198255.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6678277254104614, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8594446182250977, + "num_tokens": 271234692.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.50052809715271, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8785189390182495, + "num_tokens": 271273895.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5930399894714355, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8699837327003479, + "num_tokens": 271308362.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.5254231691360474, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8589766025543213, + "num_tokens": 271352163.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6552742719650269, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8658731579780579, + "num_tokens": 271390980.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5100399255752563, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8773213624954224, + "num_tokens": 271430605.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6405041217803955, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.866389811038971, + "num_tokens": 271466811.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5593781471252441, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8788965940475464, + "num_tokens": 271505931.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4949300289154053, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.874459981918335, + "num_tokens": 271544154.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5124415159225464, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8614174127578735, + "num_tokens": 271585716.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.741537094116211, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8462244272232056, + "num_tokens": 271623082.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6756800413131714, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8495303392410278, + "num_tokens": 271657612.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5771822929382324, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8593673706054688, + "num_tokens": 271698109.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6441938877105713, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8644570112228394, + "num_tokens": 271738506.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 2.200693130493164, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8677729368209839, + "num_tokens": 271780911.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.625701665878296, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8594794869422913, + "num_tokens": 271817362.0, + "step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7337234020233154, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8560750484466553, + "num_tokens": 271852694.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6497606039047241, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8672349452972412, + "num_tokens": 271888254.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6899290084838867, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.872104823589325, + "num_tokens": 271919000.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.5008387565612793, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8585854768753052, + "num_tokens": 271961650.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7177506685256958, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8597383499145508, + "num_tokens": 271999168.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.586523175239563, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8639194965362549, + "num_tokens": 272038584.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.8457973003387451, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8433589935302734, + "num_tokens": 272071584.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.7079757452011108, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8568774461746216, + "num_tokens": 272107359.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6608730554580688, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8730661273002625, + "num_tokens": 272142308.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6554909944534302, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8377494812011719, + "num_tokens": 272180685.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.5338932275772095, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8696884512901306, + "num_tokens": 272221323.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6746289730072021, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8724470138549805, + "num_tokens": 272260654.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4959490299224854, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8898724913597107, + "num_tokens": 272297724.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6373395919799805, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8485201001167297, + "num_tokens": 272336898.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 1.8596649169921875e-05, + "grad_norm": 1.6059582233428955, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8638287782669067, + "num_tokens": 272375484.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5467783212661743, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8670826554298401, + "num_tokens": 272416943.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.715970754623413, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8525609970092773, + "num_tokens": 272451553.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.723982572555542, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8648264408111572, + "num_tokens": 272487318.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.645194411277771, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8719116449356079, + "num_tokens": 272524139.0, + "step": 7146 + }, + { + "epoch": 0.9091718610863758, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6094014644622803, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8740547895431519, + "num_tokens": 272557712.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5636529922485352, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8529253602027893, + "num_tokens": 272599051.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5364335775375366, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8541662693023682, + "num_tokens": 272641987.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5534473657608032, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8682918548583984, + "num_tokens": 272683181.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.546269416809082, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8726279735565186, + "num_tokens": 272723340.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.6543221473693848, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8487465381622314, + "num_tokens": 272766621.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.723582148551941, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8712924122810364, + "num_tokens": 272801174.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6424537897109985, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8695489168167114, + "num_tokens": 272837651.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5610195398330688, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.859973669052124, + "num_tokens": 272878643.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5759353637695312, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8628770112991333, + "num_tokens": 272916259.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4540766477584839, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8598335385322571, + "num_tokens": 272963722.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 1.8477439880371094e-05, + "grad_norm": 1.5293591022491455, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8733365535736084, + "num_tokens": 273002682.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6297352313995361, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8621091842651367, + "num_tokens": 273040028.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5978014469146729, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8694097995758057, + "num_tokens": 273075503.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6251779794692993, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8659158945083618, + "num_tokens": 273107394.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6688337326049805, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8675119876861572, + "num_tokens": 273137717.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6688306331634521, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.872158944606781, + "num_tokens": 273168852.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5571320056915283, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8696006536483765, + "num_tokens": 273206087.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6638034582138062, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8601065278053284, + "num_tokens": 273245200.0, + "step": 7165 + }, + { + "epoch": 0.9115888563795955, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.572636604309082, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8754013180732727, + "num_tokens": 273283828.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7918671369552612, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8688833117485046, + "num_tokens": 273314620.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5095160007476807, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8533939719200134, + "num_tokens": 273357445.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.514603614807129, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8699789047241211, + "num_tokens": 273397049.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5305010080337524, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8516854643821716, + "num_tokens": 273440342.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4829761981964111, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8704469203948975, + "num_tokens": 273482739.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.495837926864624, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8571829795837402, + "num_tokens": 273525637.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5853217840194702, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.859511137008667, + "num_tokens": 273564169.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5110647678375244, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8562609553337097, + "num_tokens": 273606600.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5159671306610107, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8713877201080322, + "num_tokens": 273645787.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4393185377120972, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8811911344528198, + "num_tokens": 273687346.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5295757055282593, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.873818039894104, + "num_tokens": 273723908.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5909172296524048, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.856793224811554, + "num_tokens": 273763951.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6584290266036987, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8690378665924072, + "num_tokens": 273800573.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.543260931968689, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8638689517974854, + "num_tokens": 273840484.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5337250232696533, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8600603938102722, + "num_tokens": 273882382.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6648991107940674, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8564059734344482, + "num_tokens": 273921594.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6147540807724, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8626665472984314, + "num_tokens": 273960732.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.626824140548706, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8804057836532593, + "num_tokens": 273994907.0, + "step": 7184 + }, + { + "epoch": 0.9140058516728151, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5273867845535278, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8739596605300903, + "num_tokens": 274032171.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4287354946136475, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8858219981193542, + "num_tokens": 274072532.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6337169408798218, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8598605394363403, + "num_tokens": 274110449.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5523875951766968, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8684741258621216, + "num_tokens": 274148719.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.499159812927246, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8638702034950256, + "num_tokens": 274191531.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6280286312103271, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8737552165985107, + "num_tokens": 274226797.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5494464635849, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8708740472793579, + "num_tokens": 274261038.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.499369502067566, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8811876177787781, + "num_tokens": 274299822.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.600533127784729, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8775478005409241, + "num_tokens": 274331790.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6401747465133667, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8761622905731201, + "num_tokens": 274370130.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6286594867706299, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8789397478103638, + "num_tokens": 274408976.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6460363864898682, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8559636473655701, + "num_tokens": 274443550.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6646161079406738, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8684672117233276, + "num_tokens": 274476903.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 4.636932849884033, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8743610978126526, + "num_tokens": 274510088.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7461533546447754, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8676455616950989, + "num_tokens": 274549202.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.9478405714035034, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8658511638641357, + "num_tokens": 274588975.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6247401237487793, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8857220411300659, + "num_tokens": 274627245.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6288347244262695, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8516172170639038, + "num_tokens": 274665450.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6267805099487305, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8600434064865112, + "num_tokens": 274703692.0, + "step": 7203 + }, + { + "epoch": 0.9164228469660348, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7150367498397827, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.844074010848999, + "num_tokens": 274742559.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6634547710418701, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.863079309463501, + "num_tokens": 274774414.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.746435284614563, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8562739491462708, + "num_tokens": 274805909.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5030601024627686, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.854729950428009, + "num_tokens": 274846176.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.573087453842163, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8722707033157349, + "num_tokens": 274883067.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7128316164016724, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8654273748397827, + "num_tokens": 274918672.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5413740873336792, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.865824818611145, + "num_tokens": 274957680.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.53168523311615, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8710858821868896, + "num_tokens": 274996630.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5227237939834595, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8744961023330688, + "num_tokens": 275035868.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6861323118209839, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8617761731147766, + "num_tokens": 275070608.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5119519233703613, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8802366256713867, + "num_tokens": 275112057.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5444743633270264, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8742978572845459, + "num_tokens": 275150403.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5896000862121582, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8602752089500427, + "num_tokens": 275189224.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6995991468429565, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8619580268859863, + "num_tokens": 275229511.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6378451585769653, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8519495129585266, + "num_tokens": 275268954.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7749555110931396, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8541041612625122, + "num_tokens": 275304762.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6438663005828857, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8548943996429443, + "num_tokens": 275341492.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6746776103973389, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8675225973129272, + "num_tokens": 275383488.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5761842727661133, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8566835522651672, + "num_tokens": 275424424.0, + "step": 7222 + }, + { + "epoch": 0.9188398422592545, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5764739513397217, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8731482028961182, + "num_tokens": 275457787.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5599983930587769, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8551561832427979, + "num_tokens": 275500451.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.582177758216858, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8701351881027222, + "num_tokens": 275538001.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.638816237449646, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8667444586753845, + "num_tokens": 275577573.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6568015813827515, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8610445261001587, + "num_tokens": 275614552.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6665509939193726, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.857267439365387, + "num_tokens": 275648878.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5691492557525635, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.871390163898468, + "num_tokens": 275683207.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7054436206817627, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8496490716934204, + "num_tokens": 275717110.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6222670078277588, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8719503283500671, + "num_tokens": 275751969.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4513059854507446, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8559848070144653, + "num_tokens": 275798995.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5551954507827759, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8718290328979492, + "num_tokens": 275838744.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5614352226257324, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8602758646011353, + "num_tokens": 275877951.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6262073516845703, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8641331195831299, + "num_tokens": 275910446.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5275179147720337, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8703333139419556, + "num_tokens": 275946847.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5569020509719849, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8606967926025391, + "num_tokens": 275984634.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5849746465682983, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8680046200752258, + "num_tokens": 276022846.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6028283834457397, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8653401136398315, + "num_tokens": 276061693.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6510896682739258, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8547968864440918, + "num_tokens": 276102602.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6037147045135498, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8675913214683533, + "num_tokens": 276142328.0, + "step": 7241 + }, + { + "epoch": 0.9212568375524742, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4872337579727173, + "learning_rate": 1e-06, + "loss": 0.3244, + "mean_token_accuracy": 0.8940672278404236, + "num_tokens": 276180362.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5511960983276367, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8633378744125366, + "num_tokens": 276217759.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.549767255783081, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8593654632568359, + "num_tokens": 276259133.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.525053858757019, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8623100519180298, + "num_tokens": 276305124.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7057135105133057, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8607890605926514, + "num_tokens": 276340097.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.773032307624817, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.857543408870697, + "num_tokens": 276373672.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6157615184783936, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8681179881095886, + "num_tokens": 276413181.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7280627489089966, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8592240810394287, + "num_tokens": 276447851.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.489324927330017, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8620836734771729, + "num_tokens": 276493296.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5520529747009277, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8805606961250305, + "num_tokens": 276528304.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7626150846481323, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8565343022346497, + "num_tokens": 276558737.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7786214351654053, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8611050248146057, + "num_tokens": 276591575.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.727421522140503, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8614133596420288, + "num_tokens": 276626835.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6866123676300049, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8710679411888123, + "num_tokens": 276664753.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6559293270111084, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.858004093170166, + "num_tokens": 276704365.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6496597528457642, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.85756915807724, + "num_tokens": 276742833.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.589508056640625, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8736865520477295, + "num_tokens": 276777954.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.614889144897461, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8581496477127075, + "num_tokens": 276815660.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7555772066116333, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8758846521377563, + "num_tokens": 276847255.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.711899757385254, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8502779006958008, + "num_tokens": 276884644.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6329463720321655, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.864328145980835, + "num_tokens": 276921169.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4718788862228394, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8724839687347412, + "num_tokens": 276963291.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6767158508300781, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8587632179260254, + "num_tokens": 276997220.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4803720712661743, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8492693901062012, + "num_tokens": 277037606.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.377884030342102, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8829421401023865, + "num_tokens": 277084546.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6515247821807861, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8473669290542603, + "num_tokens": 277120704.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5433951616287231, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8702623844146729, + "num_tokens": 277158646.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.7556977272033691, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8482952117919922, + "num_tokens": 277189931.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.491571307182312, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8809689283370972, + "num_tokens": 277226410.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4433366060256958, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.864556074142456, + "num_tokens": 277270111.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5213701725006104, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8702571392059326, + "num_tokens": 277307906.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5644279718399048, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8603576421737671, + "num_tokens": 277346353.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6618481874465942, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8738633394241333, + "num_tokens": 277381063.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5574793815612793, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8609379529953003, + "num_tokens": 277422043.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5361377000808716, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8670493364334106, + "num_tokens": 277461539.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.686805248260498, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8659312725067139, + "num_tokens": 277493695.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6637732982635498, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8698254823684692, + "num_tokens": 277527848.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6042736768722534, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8641111850738525, + "num_tokens": 277564884.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.670180082321167, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8504563570022583, + "num_tokens": 277601161.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.875785231590271, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8701061010360718, + "num_tokens": 277639446.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.533957839012146, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8642947673797607, + "num_tokens": 277677017.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4290300607681274, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.878106951713562, + "num_tokens": 277721306.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4560226202011108, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8555852174758911, + "num_tokens": 277765360.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.544281244277954, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8739793300628662, + "num_tokens": 277803244.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5395222902297974, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8584110736846924, + "num_tokens": 277846238.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5844379663467407, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8690597414970398, + "num_tokens": 277880878.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6516335010528564, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8723444938659668, + "num_tokens": 277916061.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5338374376296997, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8572176694869995, + "num_tokens": 277956613.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5196428298950195, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8647423982620239, + "num_tokens": 277998851.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.4588457345962524, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8786733150482178, + "num_tokens": 278039441.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.469549298286438, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8696323037147522, + "num_tokens": 278079113.0, + "step": 7292 + }, + { + "epoch": 0.9277445617605903, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.616685390472412, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8631848096847534, + "num_tokens": 278121322.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5855239629745483, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8773291707038879, + "num_tokens": 278157139.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.646270513534546, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8605660200119019, + "num_tokens": 278189552.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.601161003112793, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8602712154388428, + "num_tokens": 278223827.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6491775512695312, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8654634952545166, + "num_tokens": 278259457.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.670795202255249, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8731719255447388, + "num_tokens": 278295302.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.552526593208313, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.862558126449585, + "num_tokens": 278335762.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.51726496219635, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8684004545211792, + "num_tokens": 278376378.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5839194059371948, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8543344140052795, + "num_tokens": 278417145.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.6299967765808105, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8554433584213257, + "num_tokens": 278452692.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5707523822784424, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8613301515579224, + "num_tokens": 278490054.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.566968560218811, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8668636083602905, + "num_tokens": 278528428.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5446326732635498, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8512818217277527, + "num_tokens": 278567382.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 1.8715858459472656e-05, + "grad_norm": 1.5480167865753174, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8569216728210449, + "num_tokens": 278607862.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.769534945487976, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8446702361106873, + "num_tokens": 278644457.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5075653791427612, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8871344327926636, + "num_tokens": 278681327.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7980635166168213, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8642730712890625, + "num_tokens": 278716695.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7132619619369507, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8591952323913574, + "num_tokens": 278754741.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4703468084335327, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8778375387191772, + "num_tokens": 278796720.0, + "step": 7311 + }, + { + "epoch": 0.93016155705381, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.690603256225586, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8695306777954102, + "num_tokens": 278830436.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7848857641220093, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8450909852981567, + "num_tokens": 278864454.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.632385015487671, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8680437803268433, + "num_tokens": 278905072.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5367809534072876, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8494016528129578, + "num_tokens": 278948228.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5804415941238403, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8572267293930054, + "num_tokens": 278989583.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.3948500156402588, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8697161674499512, + "num_tokens": 279034793.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.521091341972351, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8631808161735535, + "num_tokens": 279073551.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6645886898040771, + "learning_rate": 1e-06, + "loss": 0.5179, + "mean_token_accuracy": 0.8366337418556213, + "num_tokens": 279115343.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5358104705810547, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8874841332435608, + "num_tokens": 279150476.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6153240203857422, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8427009582519531, + "num_tokens": 279192169.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5509215593338013, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8676427602767944, + "num_tokens": 279230656.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4802477359771729, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.878943920135498, + "num_tokens": 279268894.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.613062858581543, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.85731041431427, + "num_tokens": 279308050.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6499663591384888, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8600812554359436, + "num_tokens": 279340323.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6575981378555298, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8681862950325012, + "num_tokens": 279373002.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6147555112838745, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8730685710906982, + "num_tokens": 279408511.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5156155824661255, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.874330997467041, + "num_tokens": 279448066.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6151930093765259, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8543608784675598, + "num_tokens": 279493332.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.618079662322998, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8635366559028625, + "num_tokens": 279533944.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6334868669509888, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8652894496917725, + "num_tokens": 279571950.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.610376000404358, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8642270565032959, + "num_tokens": 279609275.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7412163019180298, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8629105091094971, + "num_tokens": 279643939.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.667134165763855, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8556445837020874, + "num_tokens": 279683857.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5441399812698364, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8786396384239197, + "num_tokens": 279720819.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.561727523803711, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8801999688148499, + "num_tokens": 279753732.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.529794454574585, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8601039052009583, + "num_tokens": 279794964.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6310228109359741, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8555535078048706, + "num_tokens": 279832249.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6881695985794067, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8531259298324585, + "num_tokens": 279870673.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6847871541976929, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8557183146476746, + "num_tokens": 279911980.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7750858068466187, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8830442428588867, + "num_tokens": 279945583.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.587790608406067, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8671473264694214, + "num_tokens": 279987882.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.542089819908142, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8700326085090637, + "num_tokens": 280030027.0, + "step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4654706716537476, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.875853419303894, + "num_tokens": 280072736.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5502110719680786, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8506352305412292, + "num_tokens": 280117389.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4954570531845093, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8743315935134888, + "num_tokens": 280158098.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7941229343414307, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8536482453346252, + "num_tokens": 280192669.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.534147024154663, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8628362417221069, + "num_tokens": 280230507.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5552815198898315, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8643823266029358, + "num_tokens": 280267336.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5788453817367554, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8629190325737, + "num_tokens": 280309001.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7185477018356323, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8519513607025146, + "num_tokens": 280343315.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5789047479629517, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8681304454803467, + "num_tokens": 280381793.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7054893970489502, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8464939594268799, + "num_tokens": 280419093.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7630127668380737, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8639540672302246, + "num_tokens": 280452152.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7418346405029297, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8639349937438965, + "num_tokens": 280483731.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6241421699523926, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8758687376976013, + "num_tokens": 280523286.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5017595291137695, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8693022727966309, + "num_tokens": 280562195.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.593583583831787, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8757822513580322, + "num_tokens": 280595633.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5669234991073608, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8594556450843811, + "num_tokens": 280634472.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6023492813110352, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8562484979629517, + "num_tokens": 280673553.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5820344686508179, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8904150724411011, + "num_tokens": 280705481.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5139639377593994, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8735672235488892, + "num_tokens": 280746894.0, + "step": 7362 + }, + { + "epoch": 0.9366492812619259, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.524229645729065, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8670358657836914, + "num_tokens": 280790759.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.515928864479065, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8783175349235535, + "num_tokens": 280831137.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5846292972564697, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8693820238113403, + "num_tokens": 280870391.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5746368169784546, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8701352477073669, + "num_tokens": 280910707.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5545947551727295, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8684258460998535, + "num_tokens": 280948330.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6105674505233765, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8677560091018677, + "num_tokens": 280987432.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5878384113311768, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8611442446708679, + "num_tokens": 281025392.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.482199788093567, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8705149292945862, + "num_tokens": 281066996.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.49354887008667, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8701510429382324, + "num_tokens": 281111668.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6361863613128662, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8663735389709473, + "num_tokens": 281149337.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7224297523498535, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8742810487747192, + "num_tokens": 281182441.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5361846685409546, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8632256984710693, + "num_tokens": 281224894.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6335537433624268, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8456794023513794, + "num_tokens": 281266087.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4745713472366333, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8633803725242615, + "num_tokens": 281309568.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7032288312911987, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8377217054367065, + "num_tokens": 281350128.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6476811170578003, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8503636121749878, + "num_tokens": 281389063.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6267184019088745, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8725319504737854, + "num_tokens": 281426945.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4850564002990723, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8659579753875732, + "num_tokens": 281469009.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5029597282409668, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8680617809295654, + "num_tokens": 281506347.0, + "step": 7381 + }, + { + "epoch": 0.9390662765551456, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7742787599563599, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8784031867980957, + "num_tokens": 281539266.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5743464231491089, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8534500598907471, + "num_tokens": 281581421.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5233708620071411, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.87090665102005, + "num_tokens": 281618602.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7389295101165771, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8646993637084961, + "num_tokens": 281651514.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.615959644317627, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8870352506637573, + "num_tokens": 281684308.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.442652940750122, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8586870431900024, + "num_tokens": 281724478.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.514815330505371, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8655134439468384, + "num_tokens": 281766055.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5581384897232056, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.856766402721405, + "num_tokens": 281803692.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6789839267730713, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8753225207328796, + "num_tokens": 281842525.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.52352774143219, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8742716908454895, + "num_tokens": 281881730.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6836429834365845, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8676242828369141, + "num_tokens": 281919119.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5746171474456787, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8443326354026794, + "num_tokens": 281959016.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.716120719909668, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8745235204696655, + "num_tokens": 281989105.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5093415975570679, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.865042507648468, + "num_tokens": 282030294.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4521358013153076, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8667894601821899, + "num_tokens": 282072253.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5645239353179932, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8640612363815308, + "num_tokens": 282110695.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5551965236663818, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8750503063201904, + "num_tokens": 282149782.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.643920660018921, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.868104875087738, + "num_tokens": 282187036.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6513874530792236, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8683885335922241, + "num_tokens": 282224450.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.55644690990448, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8489974737167358, + "num_tokens": 282267145.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6519802808761597, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.846956729888916, + "num_tokens": 282305507.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6569539308547974, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8554829359054565, + "num_tokens": 282341637.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6225506067276, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8501679301261902, + "num_tokens": 282381743.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6750293970108032, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8595657348632812, + "num_tokens": 282415887.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4320920705795288, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8809051513671875, + "num_tokens": 282455356.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.677078127861023, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8683135509490967, + "num_tokens": 282492674.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5599124431610107, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8720629215240479, + "num_tokens": 282535863.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6362292766571045, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.850294291973114, + "num_tokens": 282576371.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5123134851455688, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8779977560043335, + "num_tokens": 282612978.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5934301614761353, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8692011833190918, + "num_tokens": 282648458.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5461490154266357, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8607004880905151, + "num_tokens": 282690724.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6116251945495605, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8605915904045105, + "num_tokens": 282730817.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5674264430999756, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8590819239616394, + "num_tokens": 282775468.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5956950187683105, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8657392263412476, + "num_tokens": 282815415.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6046110391616821, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8566734790802002, + "num_tokens": 282855691.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.502875804901123, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8754610419273376, + "num_tokens": 282897134.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.7434626817703247, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8710391521453857, + "num_tokens": 282933178.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.4784852266311646, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8717281222343445, + "num_tokens": 282974471.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.467407464981079, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.860358715057373, + "num_tokens": 283024014.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5605430603027344, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8584487438201904, + "num_tokens": 283070047.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6232209205627441, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8739808797836304, + "num_tokens": 283107782.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.649452805519104, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8627937436103821, + "num_tokens": 283141543.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.4713218212127686, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8631386160850525, + "num_tokens": 283182595.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.7055528163909912, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8691457509994507, + "num_tokens": 283213811.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.645249843597412, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8649727702140808, + "num_tokens": 283249054.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5658555030822754, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8638967871665955, + "num_tokens": 283285746.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6275832653045654, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8688175082206726, + "num_tokens": 283318958.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.554235816001892, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.860576868057251, + "num_tokens": 283361251.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6303212642669678, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8585615754127502, + "num_tokens": 283399063.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5433512926101685, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8669402599334717, + "num_tokens": 283437524.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.7479125261306763, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8772059679031372, + "num_tokens": 283467958.0, + "step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.4769513607025146, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8799436688423157, + "num_tokens": 283505478.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6478123664855957, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8526986837387085, + "num_tokens": 283540019.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5965466499328613, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8606427907943726, + "num_tokens": 283578966.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.499487042427063, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8570240139961243, + "num_tokens": 283623715.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.568734049797058, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8678267598152161, + "num_tokens": 283661094.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5256482362747192, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8663920164108276, + "num_tokens": 283702566.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.4967894554138184, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8710218071937561, + "num_tokens": 283747720.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.4999176263809204, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8512003421783447, + "num_tokens": 283793144.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6937355995178223, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8722531795501709, + "num_tokens": 283822570.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5652974843978882, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8632157444953918, + "num_tokens": 283864451.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.5809040069580078, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8700742721557617, + "num_tokens": 283900056.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.6520535945892334, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8716115951538086, + "num_tokens": 283935395.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.7547757625579834, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8471794128417969, + "num_tokens": 283969354.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 1.8835067749023438e-05, + "grad_norm": 1.750208854675293, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8747598528862, + "num_tokens": 284001761.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.5720826387405396, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8652547597885132, + "num_tokens": 284036934.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.5576224327087402, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8604577779769897, + "num_tokens": 284079432.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.4118244647979736, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8703235983848572, + "num_tokens": 284127741.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.7558186054229736, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8682632446289062, + "num_tokens": 284158733.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.6323603391647339, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8498138785362244, + "num_tokens": 284200553.0, + "step": 7451 + }, + { + "epoch": 0.9479709960564814, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.5140366554260254, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8795408010482788, + "num_tokens": 284237499.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.518623948097229, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8567901849746704, + "num_tokens": 284277482.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.6573803424835205, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8618606328964233, + "num_tokens": 284315382.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.5595972537994385, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8728833794593811, + "num_tokens": 284350340.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.6437565088272095, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.849541425704956, + "num_tokens": 284393525.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.6095435619354248, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8658602237701416, + "num_tokens": 284431028.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.5572509765625, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8711923956871033, + "num_tokens": 284469482.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 1.895427703857422e-05, + "grad_norm": 1.7450224161148071, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8622488975524902, + "num_tokens": 284499919.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5881011486053467, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8778209686279297, + "num_tokens": 284533435.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.752976417541504, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8649961948394775, + "num_tokens": 284564743.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6457687616348267, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8792584538459778, + "num_tokens": 284597705.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7236131429672241, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8640261888504028, + "num_tokens": 284631366.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.465620517730713, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8740512728691101, + "num_tokens": 284674772.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6159775257110596, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8687111139297485, + "num_tokens": 284708811.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.493328332901001, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.873029351234436, + "num_tokens": 284752787.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5786371231079102, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.856299638748169, + "num_tokens": 284794231.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5778342485427856, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8649035692214966, + "num_tokens": 284831510.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.53937566280365, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8787535429000854, + "num_tokens": 284869309.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5724048614501953, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8549840450286865, + "num_tokens": 284912395.0, + "step": 7470 + }, + { + "epoch": 0.9503879913497011, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5702449083328247, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8680751323699951, + "num_tokens": 284949138.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.571946144104004, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8790247440338135, + "num_tokens": 284985777.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6004447937011719, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8807925581932068, + "num_tokens": 285019564.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5501044988632202, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8743396997451782, + "num_tokens": 285060194.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7427549362182617, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8726588487625122, + "num_tokens": 285091096.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7129086256027222, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8669865131378174, + "num_tokens": 285124599.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7190463542938232, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.863049328327179, + "num_tokens": 285161922.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.528298020362854, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8612295389175415, + "num_tokens": 285201191.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.738487958908081, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.864713728427887, + "num_tokens": 285234574.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.499332308769226, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.867146372795105, + "num_tokens": 285275601.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5494757890701294, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8740612268447876, + "num_tokens": 285314210.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5547363758087158, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8679367303848267, + "num_tokens": 285357690.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6271734237670898, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8599350452423096, + "num_tokens": 285396080.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4673445224761963, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8654017448425293, + "num_tokens": 285438938.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6621865034103394, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8747543096542358, + "num_tokens": 285475668.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4950193166732788, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8639235496520996, + "num_tokens": 285519747.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.614062786102295, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.866177499294281, + "num_tokens": 285554171.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6434139013290405, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8882372379302979, + "num_tokens": 285587165.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5719290971755981, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8701075315475464, + "num_tokens": 285623366.0, + "step": 7489 + }, + { + "epoch": 0.9528049866429208, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6016032695770264, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8708779811859131, + "num_tokens": 285659305.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6122136116027832, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8545995950698853, + "num_tokens": 285697471.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5942775011062622, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8555643558502197, + "num_tokens": 285737611.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6095490455627441, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8689772486686707, + "num_tokens": 285774757.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6816825866699219, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.838782787322998, + "num_tokens": 285812915.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 3.7513413429260254, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8587071895599365, + "num_tokens": 285849039.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6392319202423096, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8737150430679321, + "num_tokens": 285883623.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7039250135421753, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8550138473510742, + "num_tokens": 285918304.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6501901149749756, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8612041473388672, + "num_tokens": 285957063.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.571531891822815, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8785120248794556, + "num_tokens": 285995560.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6632031202316284, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8641332983970642, + "num_tokens": 286031562.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5034282207489014, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8704999089241028, + "num_tokens": 286074963.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4563806056976318, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8745129108428955, + "num_tokens": 286119509.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5187194347381592, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8748371601104736, + "num_tokens": 286155237.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6616604328155518, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8570805191993713, + "num_tokens": 286191514.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.528205156326294, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8619632720947266, + "num_tokens": 286229724.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.455483078956604, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8780184984207153, + "num_tokens": 286269073.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5969572067260742, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8697403073310852, + "num_tokens": 286301904.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6534899473190308, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8641233444213867, + "num_tokens": 286339685.0, + "step": 7508 + }, + { + "epoch": 0.9552219819361404, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7399643659591675, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.861735463142395, + "num_tokens": 286377009.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8250904083251953, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8878434896469116, + "num_tokens": 286407261.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5970971584320068, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8623522520065308, + "num_tokens": 286446894.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.62353515625, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.867484450340271, + "num_tokens": 286482494.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7438838481903076, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8512241840362549, + "num_tokens": 286517891.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5971611738204956, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8511163592338562, + "num_tokens": 286560631.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6193487644195557, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8675705194473267, + "num_tokens": 286599546.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5901780128479004, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8771645426750183, + "num_tokens": 286637803.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5415951013565063, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.869597315788269, + "num_tokens": 286677811.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6985152959823608, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8792513012886047, + "num_tokens": 286714367.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5727771520614624, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8719173073768616, + "num_tokens": 286753241.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7603232860565186, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8722853660583496, + "num_tokens": 286784599.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5589643716812134, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8787590265274048, + "num_tokens": 286823589.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.461740255355835, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8600384593009949, + "num_tokens": 286866878.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4251327514648438, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8736512660980225, + "num_tokens": 286907451.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5295928716659546, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8618711233139038, + "num_tokens": 286947142.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7331422567367554, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8432336449623108, + "num_tokens": 286984916.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5420849323272705, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8777288794517517, + "num_tokens": 287019251.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7208013534545898, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8563980460166931, + "num_tokens": 287054092.0, + "step": 7527 + }, + { + "epoch": 0.9576389772293601, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6630916595458984, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8773044943809509, + "num_tokens": 287092520.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8843920230865479, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8615939617156982, + "num_tokens": 287121081.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6179039478302002, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8878296613693237, + "num_tokens": 287154136.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.745450735092163, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8844267129898071, + "num_tokens": 287183952.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5727922916412354, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8732417225837708, + "num_tokens": 287219687.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.766808271408081, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8784443736076355, + "num_tokens": 287251689.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8424549102783203, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8634843826293945, + "num_tokens": 287282890.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6044001579284668, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8639125227928162, + "num_tokens": 287319284.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6412256956100464, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8591209053993225, + "num_tokens": 287360443.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6560755968093872, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8699890375137329, + "num_tokens": 287398251.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5124942064285278, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8717229962348938, + "num_tokens": 287438108.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.586098313331604, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.87367182970047, + "num_tokens": 287473649.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5242791175842285, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8780515193939209, + "num_tokens": 287515795.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.561384916305542, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8643368482589722, + "num_tokens": 287556653.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6577227115631104, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8586376905441284, + "num_tokens": 287592869.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5323715209960938, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8555046319961548, + "num_tokens": 287632999.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4787057638168335, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8476777672767639, + "num_tokens": 287677692.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7586861848831177, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8608315587043762, + "num_tokens": 287712766.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5026463270187378, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8706371784210205, + "num_tokens": 287753694.0, + "step": 7546 + }, + { + "epoch": 0.9600559725225798, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6494107246398926, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8624769449234009, + "num_tokens": 287787550.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5281773805618286, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8671349287033081, + "num_tokens": 287826610.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.82862389087677, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8606929779052734, + "num_tokens": 287861066.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4969414472579956, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8653826713562012, + "num_tokens": 287903948.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4985084533691406, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.880297064781189, + "num_tokens": 287943899.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6779379844665527, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.867490291595459, + "num_tokens": 287979979.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4836968183517456, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8787167072296143, + "num_tokens": 288018670.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7058848142623901, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8727717995643616, + "num_tokens": 288056083.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6236594915390015, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8709994554519653, + "num_tokens": 288093054.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5550286769866943, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8622748851776123, + "num_tokens": 288135526.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.534268856048584, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8629904985427856, + "num_tokens": 288178692.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.684368371963501, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8631002902984619, + "num_tokens": 288211856.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.418232798576355, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8736449480056763, + "num_tokens": 288255522.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5570374727249146, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8725477457046509, + "num_tokens": 288291851.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.652345061302185, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8602744340896606, + "num_tokens": 288329360.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6053804159164429, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8580484390258789, + "num_tokens": 288365059.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5482622385025024, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8723434209823608, + "num_tokens": 288404936.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5640592575073242, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8577347993850708, + "num_tokens": 288443194.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.618336796760559, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8585345149040222, + "num_tokens": 288480857.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4380441904067993, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8760915994644165, + "num_tokens": 288522077.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.573806881904602, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8729235529899597, + "num_tokens": 288559111.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6619365215301514, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.872451663017273, + "num_tokens": 288594150.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.4743849039077759, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8652947545051575, + "num_tokens": 288640632.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.7001068592071533, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.865788459777832, + "num_tokens": 288675014.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5004584789276123, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8571314811706543, + "num_tokens": 288717107.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.5645502805709839, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8732348680496216, + "num_tokens": 288757520.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6371736526489258, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8660495281219482, + "num_tokens": 288798804.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.6590675115585327, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8529859781265259, + "num_tokens": 288834697.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5446680784225464, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8606579303741455, + "num_tokens": 288873857.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 1.9073486328125e-05, + "grad_norm": 1.568418025970459, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8681359887123108, + "num_tokens": 288910350.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6862441301345825, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8642464876174927, + "num_tokens": 288945395.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.3811700344085693, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8769701719284058, + "num_tokens": 288989122.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5016605854034424, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8699156045913696, + "num_tokens": 289027297.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5749220848083496, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8758624792098999, + "num_tokens": 289063547.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5466078519821167, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8697443604469299, + "num_tokens": 289099392.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7085096836090088, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8642634749412537, + "num_tokens": 289131990.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4823271036148071, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8652750253677368, + "num_tokens": 289175614.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5793774127960205, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8683937788009644, + "num_tokens": 289216454.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.587064504623413, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8764251470565796, + "num_tokens": 289248990.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5661388635635376, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8662174940109253, + "num_tokens": 289287104.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5766475200653076, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8599124550819397, + "num_tokens": 289328306.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6633952856063843, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8680194616317749, + "num_tokens": 289364396.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5734227895736694, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8607027530670166, + "num_tokens": 289402040.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7342004776000977, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8632043600082397, + "num_tokens": 289435346.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6772278547286987, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8577715158462524, + "num_tokens": 289474848.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4826362133026123, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8683862090110779, + "num_tokens": 289516052.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.595751404762268, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8603973984718323, + "num_tokens": 289554794.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5462698936462402, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8634854555130005, + "num_tokens": 289597733.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.574251413345337, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8603276610374451, + "num_tokens": 289639604.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.594566822052002, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8727312088012695, + "num_tokens": 289674358.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4371867179870605, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8865296840667725, + "num_tokens": 289714577.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6820507049560547, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8649729490280151, + "num_tokens": 289751662.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4497183561325073, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8798119425773621, + "num_tokens": 289791775.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5724642276763916, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8558154702186584, + "num_tokens": 289831764.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4849498271942139, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8687102198600769, + "num_tokens": 289872920.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5801938772201538, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8542805910110474, + "num_tokens": 289913932.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4675120115280151, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8662760257720947, + "num_tokens": 289956819.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.9666074514389038, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8572181463241577, + "num_tokens": 289995274.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5787937641143799, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8548061847686768, + "num_tokens": 290036469.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6006299257278442, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8478977680206299, + "num_tokens": 290078696.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6217557191848755, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8525146245956421, + "num_tokens": 290117085.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.419415831565857, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.863636314868927, + "num_tokens": 290162075.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6618127822875977, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8642208576202393, + "num_tokens": 290196855.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.506089687347412, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8605911731719971, + "num_tokens": 290241875.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5216190814971924, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8832995891571045, + "num_tokens": 290279815.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4707125425338745, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8648790121078491, + "num_tokens": 290321357.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6194007396697998, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8592963218688965, + "num_tokens": 290361281.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5426446199417114, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8612824082374573, + "num_tokens": 290401654.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.678120732307434, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8592880368232727, + "num_tokens": 290436272.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5171252489089966, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8654744625091553, + "num_tokens": 290476627.0, + "step": 7616 + }, + { + "epoch": 0.9689606920239155, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5847320556640625, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8568687438964844, + "num_tokens": 290519591.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5771825313568115, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8562225103378296, + "num_tokens": 290559087.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5920493602752686, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8558236360549927, + "num_tokens": 290597033.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6126726865768433, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8516401052474976, + "num_tokens": 290636147.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5787705183029175, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.869133472442627, + "num_tokens": 290678056.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8149213790893555, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8682190179824829, + "num_tokens": 290708241.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5983693599700928, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8769795894622803, + "num_tokens": 290744287.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5819810628890991, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8660595417022705, + "num_tokens": 290783047.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4848788976669312, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8817459344863892, + "num_tokens": 290823598.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6793181896209717, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8719820380210876, + "num_tokens": 290859823.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.526358962059021, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8499057292938232, + "num_tokens": 290901595.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5853173732757568, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8509412407875061, + "num_tokens": 290940575.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6409376859664917, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.87455153465271, + "num_tokens": 290976818.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.617539405822754, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8730826377868652, + "num_tokens": 291014296.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.805993676185608, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8731094598770142, + "num_tokens": 291051449.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6833250522613525, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8805528879165649, + "num_tokens": 291086803.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5744961500167847, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8510100245475769, + "num_tokens": 291125831.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 3.816317558288574, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8475204706192017, + "num_tokens": 291158692.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.522403359413147, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8620907068252563, + "num_tokens": 291199276.0, + "step": 7635 + }, + { + "epoch": 0.9713776873171353, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7323315143585205, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8560805320739746, + "num_tokens": 291233854.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8077278137207031, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8544713258743286, + "num_tokens": 291267666.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.581563115119934, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8827462792396545, + "num_tokens": 291306697.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7615036964416504, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8674720525741577, + "num_tokens": 291339974.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7186188697814941, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8513984084129333, + "num_tokens": 291373075.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6665273904800415, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8649042844772339, + "num_tokens": 291413467.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6729199886322021, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8676629066467285, + "num_tokens": 291448722.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5497926473617554, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8726542592048645, + "num_tokens": 291486715.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6743515729904175, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8697668313980103, + "num_tokens": 291522351.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6298644542694092, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8626395463943481, + "num_tokens": 291558622.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6187797784805298, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8686079978942871, + "num_tokens": 291598355.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6570971012115479, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8631148934364319, + "num_tokens": 291635573.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5685648918151855, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8651976585388184, + "num_tokens": 291672838.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5063577890396118, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8632959127426147, + "num_tokens": 291710884.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6670745611190796, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8711705207824707, + "num_tokens": 291748498.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.584923505783081, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8700963258743286, + "num_tokens": 291785618.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.46319580078125, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8679590225219727, + "num_tokens": 291826704.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6308107376098633, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8635444641113281, + "num_tokens": 291862293.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6162967681884766, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8590051531791687, + "num_tokens": 291899579.0, + "step": 7654 + }, + { + "epoch": 0.973794682610355, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6670950651168823, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8759702444076538, + "num_tokens": 291933801.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6703166961669922, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8655004501342773, + "num_tokens": 291973427.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6567176580429077, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8747142553329468, + "num_tokens": 292006856.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6244125366210938, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8603224754333496, + "num_tokens": 292045243.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.516288161277771, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8706326484680176, + "num_tokens": 292089962.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.543825626373291, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8635205030441284, + "num_tokens": 292131722.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.598362684249878, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.85155189037323, + "num_tokens": 292176648.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6127568483352661, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8639919757843018, + "num_tokens": 292216843.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6929835081100464, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8618283271789551, + "num_tokens": 292255830.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.517025351524353, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8674169778823853, + "num_tokens": 292296221.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.579892635345459, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.845680296421051, + "num_tokens": 292339307.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.3411026000976562, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8903660774230957, + "num_tokens": 292384390.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6510127782821655, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8847234845161438, + "num_tokens": 292419251.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6486657857894897, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8731865882873535, + "num_tokens": 292452003.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5957764387130737, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8692038059234619, + "num_tokens": 292492903.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5303541421890259, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8801097273826599, + "num_tokens": 292535589.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.721517562866211, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.847061276435852, + "num_tokens": 292571152.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5686310529708862, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8767167925834656, + "num_tokens": 292606944.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6921112537384033, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8501346707344055, + "num_tokens": 292647370.0, + "step": 7673 + }, + { + "epoch": 0.9762116779035747, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6146150827407837, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8794130682945251, + "num_tokens": 292685336.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6583818197250366, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8605218529701233, + "num_tokens": 292726974.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6767793893814087, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.863689661026001, + "num_tokens": 292762586.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.662264108657837, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.867077112197876, + "num_tokens": 292796050.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.539986491203308, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8803259134292603, + "num_tokens": 292830126.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6801905632019043, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.868480920791626, + "num_tokens": 292867650.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6052366495132446, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8663374781608582, + "num_tokens": 292906445.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5768029689788818, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8691798448562622, + "num_tokens": 292945749.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.508304238319397, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8839569687843323, + "num_tokens": 292983407.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4667586088180542, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8692541122436523, + "num_tokens": 293023362.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7293128967285156, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8686144948005676, + "num_tokens": 293056253.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5397322177886963, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8650317788124084, + "num_tokens": 293097394.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.657044768333435, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8552245497703552, + "num_tokens": 293136875.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5311477184295654, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8481431007385254, + "num_tokens": 293180326.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.646578311920166, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8532489538192749, + "num_tokens": 293218938.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5398082733154297, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8660395741462708, + "num_tokens": 293257397.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6082497835159302, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8515753746032715, + "num_tokens": 293293541.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5111289024353027, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8669404983520508, + "num_tokens": 293334650.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.642309546470642, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8616660833358765, + "num_tokens": 293372217.0, + "step": 7692 + }, + { + "epoch": 0.9786286731967943, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8046348094940186, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8462977409362793, + "num_tokens": 293407452.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4877735376358032, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8568060994148254, + "num_tokens": 293451751.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5022281408309937, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8511620759963989, + "num_tokens": 293497223.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6241716146469116, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.868992805480957, + "num_tokens": 293531448.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.485657811164856, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8784423470497131, + "num_tokens": 293568216.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5135552883148193, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8615257740020752, + "num_tokens": 293612578.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5349222421646118, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8630908131599426, + "num_tokens": 293653839.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8187671899795532, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.87360680103302, + "num_tokens": 293680651.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7017611265182495, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8611061573028564, + "num_tokens": 293713823.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5630521774291992, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8642880916595459, + "num_tokens": 293752696.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.674544334411621, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8563823103904724, + "num_tokens": 293790984.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.525075078010559, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8747114539146423, + "num_tokens": 293827770.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5490753650665283, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8528833389282227, + "num_tokens": 293867230.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6230840682983398, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.867894172668457, + "num_tokens": 293902990.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6094970703125, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8692587614059448, + "num_tokens": 293941826.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6467055082321167, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8529324531555176, + "num_tokens": 293980084.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4643006324768066, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8704249858856201, + "num_tokens": 294021577.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5011967420578003, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8774302005767822, + "num_tokens": 294058968.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5800265073776245, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8705434799194336, + "num_tokens": 294100819.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4704045057296753, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8774683475494385, + "num_tokens": 294137805.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.728540062904358, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8760581016540527, + "num_tokens": 294169732.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.3724086284637451, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8686481714248657, + "num_tokens": 294216011.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4981582164764404, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8679090142250061, + "num_tokens": 294253992.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.647390365600586, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8711850047111511, + "num_tokens": 294288739.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4387112855911255, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8651162385940552, + "num_tokens": 294331620.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6018505096435547, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8491190075874329, + "num_tokens": 294378045.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5399855375289917, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8818293809890747, + "num_tokens": 294413934.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5779643058776855, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8725937008857727, + "num_tokens": 294451563.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.508182168006897, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8589106798171997, + "num_tokens": 294493009.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8038487434387207, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8613771200180054, + "num_tokens": 294522064.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4969481229782104, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8550145030021667, + "num_tokens": 294568824.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.513388991355896, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8699892163276672, + "num_tokens": 294612945.0, + "step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6203973293304443, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8460495471954346, + "num_tokens": 294655192.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6875004768371582, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8690218925476074, + "num_tokens": 294692789.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.508207082748413, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8819449543952942, + "num_tokens": 294729903.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5850334167480469, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8729037046432495, + "num_tokens": 294770347.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6017417907714844, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8476037979125977, + "num_tokens": 294810547.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6141031980514526, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8532162308692932, + "num_tokens": 294848852.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.9750745296478271, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8599537014961243, + "num_tokens": 294883301.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7684988975524902, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8722825050354004, + "num_tokens": 294917349.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5564717054367065, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.880920946598053, + "num_tokens": 294953586.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.578094482421875, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8544732332229614, + "num_tokens": 294992922.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.623355746269226, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8734186291694641, + "num_tokens": 295024091.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4451849460601807, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8678184747695923, + "num_tokens": 295070288.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5585579872131348, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8512777090072632, + "num_tokens": 295111750.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7080073356628418, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8608753681182861, + "num_tokens": 295145118.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.592752456665039, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8736971616744995, + "num_tokens": 295183514.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6487072706222534, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8721394538879395, + "num_tokens": 295224701.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.537299633026123, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8701722025871277, + "num_tokens": 295262906.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6569876670837402, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8651074171066284, + "num_tokens": 295296707.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6438140869140625, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8633146286010742, + "num_tokens": 295329891.0, + "step": 7743 + }, + { + "epoch": 0.9851163974049103, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6888600587844849, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8670154809951782, + "num_tokens": 295367093.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7519323825836182, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8709196448326111, + "num_tokens": 295406592.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6022366285324097, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8677123785018921, + "num_tokens": 295442796.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6415059566497803, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8589345216751099, + "num_tokens": 295480960.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5467242002487183, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8730422258377075, + "num_tokens": 295518164.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6431595087051392, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8790313601493835, + "num_tokens": 295553432.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.674048662185669, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8689508438110352, + "num_tokens": 295587331.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4891529083251953, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8642974495887756, + "num_tokens": 295630146.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5476182699203491, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8679845929145813, + "num_tokens": 295669992.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.548744559288025, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.872599184513092, + "num_tokens": 295712427.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.8999221324920654, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8687770962715149, + "num_tokens": 295744311.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6829723119735718, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8833655714988708, + "num_tokens": 295777877.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.647071123123169, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8746979832649231, + "num_tokens": 295812523.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.559240698814392, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8603445291519165, + "num_tokens": 295854880.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5194907188415527, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8609598875045776, + "num_tokens": 295900793.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.655077576637268, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8419481515884399, + "num_tokens": 295942799.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5825527906417847, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8642833828926086, + "num_tokens": 295984360.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4531062841415405, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.855675995349884, + "num_tokens": 296032187.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.686566948890686, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8550338745117188, + "num_tokens": 296067811.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6243027448654175, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8476722240447998, + "num_tokens": 296103905.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6292188167572021, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8589669466018677, + "num_tokens": 296140823.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.435124158859253, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8742085695266724, + "num_tokens": 296181644.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6009513139724731, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8763107061386108, + "num_tokens": 296219523.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6458094120025635, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8699098825454712, + "num_tokens": 296257934.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6694352626800537, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8428521752357483, + "num_tokens": 296297539.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.349745750427246, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8759933710098267, + "num_tokens": 296346047.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5435843467712402, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8750630617141724, + "num_tokens": 296385852.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5214072465896606, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8754066824913025, + "num_tokens": 296423968.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.601935863494873, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8625017404556274, + "num_tokens": 296464110.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.538647174835205, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8699218034744263, + "num_tokens": 296502523.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5654586553573608, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8736526966094971, + "num_tokens": 296541080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5849788188934326, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8580851554870605, + "num_tokens": 296583713.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6556590795516968, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.851495623588562, + "num_tokens": 296620487.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.491500735282898, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8692055344581604, + "num_tokens": 296661154.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5794503688812256, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8743982315063477, + "num_tokens": 296693926.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.598762035369873, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8881298899650574, + "num_tokens": 296725504.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5760626792907715, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8817410469055176, + "num_tokens": 296760698.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5759077072143555, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8618050813674927, + "num_tokens": 296802076.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7196234464645386, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8685392141342163, + "num_tokens": 296839012.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7177339792251587, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8724179267883301, + "num_tokens": 296871114.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6481387615203857, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8790711760520935, + "num_tokens": 296904247.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5885024070739746, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8601362109184265, + "num_tokens": 296948181.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5667554140090942, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8540606498718262, + "num_tokens": 296986976.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5267318487167358, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8662445545196533, + "num_tokens": 297026547.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4841644763946533, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8742282390594482, + "num_tokens": 297067268.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5870572328567505, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8612564206123352, + "num_tokens": 297104920.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5924863815307617, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8687843680381775, + "num_tokens": 297140936.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5274890661239624, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8762527108192444, + "num_tokens": 297182754.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6180700063705444, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8587936162948608, + "num_tokens": 297226243.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6472980976104736, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8675425052642822, + "num_tokens": 297262018.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6022385358810425, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8753911852836609, + "num_tokens": 297297958.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6984949111938477, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.85899418592453, + "num_tokens": 297331892.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4968832731246948, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.865195631980896, + "num_tokens": 297372970.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.545427680015564, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.878415048122406, + "num_tokens": 297410298.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.449438452720642, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8516093492507935, + "num_tokens": 297454830.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.620025634765625, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8660326600074768, + "num_tokens": 297489309.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5427976846694946, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8534910678863525, + "num_tokens": 297532940.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6441420316696167, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8470159769058228, + "num_tokens": 297574956.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5576190948486328, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8661993145942688, + "num_tokens": 297613744.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.617796540260315, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8511489629745483, + "num_tokens": 297654142.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.642849087715149, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8766974806785583, + "num_tokens": 297688784.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5599573850631714, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8640641570091248, + "num_tokens": 297731139.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.549106240272522, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8724828958511353, + "num_tokens": 297774203.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4119110107421875, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8728169202804565, + "num_tokens": 297817340.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5954734086990356, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.862847089767456, + "num_tokens": 297854890.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6438860893249512, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8544389009475708, + "num_tokens": 297893477.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6517170667648315, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8687052726745605, + "num_tokens": 297927014.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.681350827217102, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8557338118553162, + "num_tokens": 297962776.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.658681035041809, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8673344850540161, + "num_tokens": 298000279.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5849765539169312, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8683136701583862, + "num_tokens": 298038811.0, + "step": 7813 + }, + { + "epoch": 0.9940211169062461, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5412957668304443, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.878899097442627, + "num_tokens": 298074500.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6049352884292603, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8479944467544556, + "num_tokens": 298114156.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4814594984054565, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8633054494857788, + "num_tokens": 298156660.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6470199823379517, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8454602956771851, + "num_tokens": 298195373.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5428977012634277, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.860104501247406, + "num_tokens": 298233917.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6608681678771973, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8533704280853271, + "num_tokens": 298268493.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4857571125030518, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.866438627243042, + "num_tokens": 298310789.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.488182783126831, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.858332633972168, + "num_tokens": 298354253.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5515636205673218, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8679667711257935, + "num_tokens": 298395490.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5136637687683105, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8723008036613464, + "num_tokens": 298436429.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5038831233978271, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8603696823120117, + "num_tokens": 298478209.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6189414262771606, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8627026677131653, + "num_tokens": 298520816.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.542357325553894, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8521928787231445, + "num_tokens": 298562314.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5644471645355225, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8553597927093506, + "num_tokens": 298604854.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.652722954750061, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8627023100852966, + "num_tokens": 298641545.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.575974464416504, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8530953526496887, + "num_tokens": 298682394.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6465948820114136, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8551425933837891, + "num_tokens": 298719968.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5511776208877563, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8677247762680054, + "num_tokens": 298756114.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6190835237503052, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.843874454498291, + "num_tokens": 298792093.0, + "step": 7832 + }, + { + "epoch": 0.9964381121994658, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5363515615463257, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.863690972328186, + "num_tokens": 298831948.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.622573971748352, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8691685795783997, + "num_tokens": 298868611.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7645652294158936, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8598899841308594, + "num_tokens": 298898982.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.554357647895813, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8711988925933838, + "num_tokens": 298940845.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.701991319656372, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.867268443107605, + "num_tokens": 298973221.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.7253081798553467, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8558825850486755, + "num_tokens": 299006994.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5006566047668457, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8791664838790894, + "num_tokens": 299046473.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5845699310302734, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8749758005142212, + "num_tokens": 299086613.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.505987286567688, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8677148222923279, + "num_tokens": 299126525.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6264982223510742, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8562767505645752, + "num_tokens": 299169346.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6679476499557495, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8484034538269043, + "num_tokens": 299204638.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4921457767486572, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8795802593231201, + "num_tokens": 299244148.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5278403759002686, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8640832304954529, + "num_tokens": 299284732.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4877068996429443, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.857255756855011, + "num_tokens": 299326159.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5614697933197021, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8693481683731079, + "num_tokens": 299364305.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6500533819198608, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8606390953063965, + "num_tokens": 299398701.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5331066846847534, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8607147932052612, + "num_tokens": 299438370.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5186048746109009, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.875893235206604, + "num_tokens": 299477728.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.7075653076171875, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8577998280525208, + "num_tokens": 299510495.0, + "step": 7851 + }, + { + "epoch": 0.9988551074926854, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5569926500320435, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8754727840423584, + "num_tokens": 299546954.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.4981814622879028, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8732747435569763, + "num_tokens": 299588597.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6576015949249268, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8659786581993103, + "num_tokens": 299622206.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.527833104133606, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8671520352363586, + "num_tokens": 299659660.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.4799226522445679, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8791604042053223, + "num_tokens": 299699931.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.6194467544555664, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8752800226211548, + "num_tokens": 299734145.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.5740063190460205, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8500180840492249, + "num_tokens": 299773673.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 1.919269561767578e-05, + "grad_norm": 1.620341181755066, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8640166521072388, + "num_tokens": 299810272.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5887460708618164, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8549431562423706, + "num_tokens": 299848639.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6000617742538452, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8689497709274292, + "num_tokens": 299886327.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.3978348970413208, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8720795512199402, + "num_tokens": 299927606.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5713411569595337, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8697845339775085, + "num_tokens": 299966521.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6643482446670532, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8755935430526733, + "num_tokens": 300007426.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.562337040901184, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.870079517364502, + "num_tokens": 300043668.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5543574094772339, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8678505420684814, + "num_tokens": 300081477.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.45192551612854, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8782271146774292, + "num_tokens": 300122652.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5290253162384033, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.865151584148407, + "num_tokens": 300165563.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.631003499031067, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8738635182380676, + "num_tokens": 300202282.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 3.7721714973449707, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8723514676094055, + "num_tokens": 300245267.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6751843690872192, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8688547611236572, + "num_tokens": 300286912.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6759966611862183, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8749182820320129, + "num_tokens": 300325061.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6082117557525635, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8618309497833252, + "num_tokens": 300363667.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5733823776245117, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8612078428268433, + "num_tokens": 300404925.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.5480302572250366, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8742560744285583, + "num_tokens": 300446620.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6369850635528564, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8714576959609985, + "num_tokens": 300484284.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.633856177330017, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8600554466247559, + "num_tokens": 300523082.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6975187063217163, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8736968040466309, + "num_tokens": 300555733.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.7620774507522583, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8779480457305908, + "num_tokens": 300585379.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6348627805709839, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8662225008010864, + "num_tokens": 300622518.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.7587957382202148, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8562777638435364, + "num_tokens": 300657609.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.6707016229629517, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8702635169029236, + "num_tokens": 300699532.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.9760030508041382, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8721752166748047, + "num_tokens": 300736820.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.4837309122085571, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8913552165031433, + "num_tokens": 300776376.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7868602275848389, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8554638624191284, + "num_tokens": 300813130.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6262481212615967, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8630997538566589, + "num_tokens": 300853065.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 1.9311904907226562e-05, + "grad_norm": 1.7060738801956177, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8722630739212036, + "num_tokens": 300894912.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.56600821018219, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8674240112304688, + "num_tokens": 300935457.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5815391540527344, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8828628063201904, + "num_tokens": 300976177.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.636694073677063, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8605510592460632, + "num_tokens": 301016343.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6089133024215698, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8691007494926453, + "num_tokens": 301055492.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.618004322052002, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8774245977401733, + "num_tokens": 301093139.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.703125238418579, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8653275370597839, + "num_tokens": 301131596.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6095290184020996, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8633029460906982, + "num_tokens": 301167108.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5077050924301147, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8724034428596497, + "num_tokens": 301208654.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6784645318984985, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8704675436019897, + "num_tokens": 301246677.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.8065946102142334, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.867374062538147, + "num_tokens": 301282225.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.507433295249939, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8704503774642944, + "num_tokens": 301322636.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5014697313308716, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8631870746612549, + "num_tokens": 301367528.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.548857569694519, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8709863424301147, + "num_tokens": 301409298.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6903847455978394, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8610523343086243, + "num_tokens": 301442848.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6122713088989258, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8779481053352356, + "num_tokens": 301482814.0, + "step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6186869144439697, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8839429020881653, + "num_tokens": 301521590.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5992238521575928, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8821356296539307, + "num_tokens": 301560710.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.764165997505188, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8611867427825928, + "num_tokens": 301595872.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5976874828338623, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8758234977722168, + "num_tokens": 301631654.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.56718111038208, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8902906179428101, + "num_tokens": 301667537.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6181516647338867, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8580599427223206, + "num_tokens": 301709196.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6574041843414307, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.876527726650238, + "num_tokens": 301746165.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.8370414972305298, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8669296503067017, + "num_tokens": 301781287.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6402374505996704, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8774778842926025, + "num_tokens": 301818208.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5870258808135986, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8590049147605896, + "num_tokens": 301859021.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6496812105178833, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8843926787376404, + "num_tokens": 301896265.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.560755729675293, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8718104362487793, + "num_tokens": 301937092.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6000336408615112, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8711100816726685, + "num_tokens": 301974595.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.611000895500183, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8669219017028809, + "num_tokens": 302017216.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5970665216445923, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8874088525772095, + "num_tokens": 302050811.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5579890012741089, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8734359741210938, + "num_tokens": 302091764.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6000251770019531, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8679031729698181, + "num_tokens": 302131881.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6449835300445557, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8564177751541138, + "num_tokens": 302168199.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6631667613983154, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8753520250320435, + "num_tokens": 302206595.0, + "step": 7921 + }, + { + "epoch": 1.0077598269940211, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5835793018341064, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8727437257766724, + "num_tokens": 302241138.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6705032587051392, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8769425749778748, + "num_tokens": 302282054.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7364623546600342, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8609073758125305, + "num_tokens": 302321608.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5699944496154785, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8721597194671631, + "num_tokens": 302359638.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7121959924697876, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8696346282958984, + "num_tokens": 302392131.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.52212655544281, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8726896643638611, + "num_tokens": 302432145.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6708941459655762, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8608056306838989, + "num_tokens": 302470029.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.414254069328308, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8853199481964111, + "num_tokens": 302512579.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.567399501800537, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.879920482635498, + "num_tokens": 302549549.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.60802161693573, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8773809671401978, + "num_tokens": 302584073.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5187069177627563, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8724725246429443, + "num_tokens": 302625846.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5629513263702393, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8670967817306519, + "num_tokens": 302666414.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6347801685333252, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8698000311851501, + "num_tokens": 302701493.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6290816068649292, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8608791828155518, + "num_tokens": 302737548.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6171553134918213, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8816261291503906, + "num_tokens": 302773855.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5287483930587769, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8747459053993225, + "num_tokens": 302812704.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.762161374092102, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.851464033126831, + "num_tokens": 302845767.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.596901535987854, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8706892728805542, + "num_tokens": 302884689.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5291035175323486, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.863726019859314, + "num_tokens": 302926921.0, + "step": 7940 + }, + { + "epoch": 1.0101768222872407, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6072137355804443, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8762359619140625, + "num_tokens": 302963665.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6192864179611206, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8618471622467041, + "num_tokens": 303005132.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5890430212020874, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8763728141784668, + "num_tokens": 303046753.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6647603511810303, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8863186836242676, + "num_tokens": 303079260.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5793696641921997, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.870029628276825, + "num_tokens": 303118187.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.601608395576477, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8632861971855164, + "num_tokens": 303155358.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.5388648509979248, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8573405742645264, + "num_tokens": 303198412.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7047886848449707, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8531897664070129, + "num_tokens": 303234773.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7601665258407593, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8668577671051025, + "num_tokens": 303270318.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.8917765617370605, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8711872100830078, + "num_tokens": 303297457.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6513034105300903, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8477790951728821, + "num_tokens": 303335813.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6649773120880127, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8501262068748474, + "num_tokens": 303377181.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.7983261346817017, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8496354818344116, + "num_tokens": 303410440.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 1.9431114196777344e-05, + "grad_norm": 1.6911994218826294, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8770544528961182, + "num_tokens": 303443405.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.4791100025177002, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8768550753593445, + "num_tokens": 303485333.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.9698894023895264, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8716229796409607, + "num_tokens": 303513732.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5811138153076172, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8729063868522644, + "num_tokens": 303550685.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.7075655460357666, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8699040412902832, + "num_tokens": 303586516.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.8197661638259888, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8751773238182068, + "num_tokens": 303620261.0, + "step": 7959 + }, + { + "epoch": 1.0125938175804605, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6682378053665161, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8534501194953918, + "num_tokens": 303658681.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6399080753326416, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8646743297576904, + "num_tokens": 303696236.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6136842966079712, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8803402781486511, + "num_tokens": 303733298.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.592253565788269, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8571293354034424, + "num_tokens": 303774256.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6614930629730225, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8708828687667847, + "num_tokens": 303812614.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.7410457134246826, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8512824773788452, + "num_tokens": 303846733.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.651505708694458, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8775144815444946, + "num_tokens": 303885785.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.611449122428894, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8628929853439331, + "num_tokens": 303924722.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6151158809661865, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8589768409729004, + "num_tokens": 303966361.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5637613534927368, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8703275918960571, + "num_tokens": 304007912.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.717111587524414, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8660781979560852, + "num_tokens": 304043882.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5954288244247437, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8808475136756897, + "num_tokens": 304079903.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.47784423828125, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.867006778717041, + "num_tokens": 304124824.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.8450886011123657, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8524713516235352, + "num_tokens": 304157970.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.730527639389038, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8785645961761475, + "num_tokens": 304192366.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6986044645309448, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8791557550430298, + "num_tokens": 304226016.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5184510946273804, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8864117860794067, + "num_tokens": 304263725.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5989645719528198, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8697008490562439, + "num_tokens": 304302573.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.562566876411438, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8655366897583008, + "num_tokens": 304345987.0, + "step": 7978 + }, + { + "epoch": 1.0150108128736801, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.6293976306915283, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8767266869544983, + "num_tokens": 304378950.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.5057703256607056, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8841761350631714, + "num_tokens": 304418498.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5578417778015137, + "learning_rate": 1e-06, + "loss": 0.3154, + "mean_token_accuracy": 0.8955613374710083, + "num_tokens": 304455175.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 1.9550323486328125e-05, + "grad_norm": 1.588719367980957, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8751180768013, + "num_tokens": 304492342.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5387588739395142, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8805379867553711, + "num_tokens": 304531784.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.6008472442626953, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8803185224533081, + "num_tokens": 304568159.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.7754355669021606, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8738313913345337, + "num_tokens": 304602273.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.7827844619750977, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8626233339309692, + "num_tokens": 304636252.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.4776344299316406, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8871872425079346, + "num_tokens": 304675464.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.7134507894515991, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8826436996459961, + "num_tokens": 304708675.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.645760178565979, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8607844710350037, + "num_tokens": 304745015.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.8458207845687866, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8506762385368347, + "num_tokens": 304779763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.553398609161377, + "learning_rate": 1e-06, + "loss": 0.3327, + "mean_token_accuracy": 0.8914222717285156, + "num_tokens": 304815698.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5854341983795166, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8805886507034302, + "num_tokens": 304852128.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5087852478027344, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8739949464797974, + "num_tokens": 304893675.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5505510568618774, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.864799976348877, + "num_tokens": 304933443.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.4773366451263428, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8804612159729004, + "num_tokens": 304977822.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5562688112258911, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8637959957122803, + "num_tokens": 305019296.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.645930528640747, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8780755996704102, + "num_tokens": 305057645.0, + "step": 7997 + }, + { + "epoch": 1.0174278081669, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.679575800895691, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8662518858909607, + "num_tokens": 305095814.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5115524530410767, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8593056201934814, + "num_tokens": 305140003.0, + "step": 7999 + }, + { + "epoch": 1.017682228724081, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.46515691280365, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8753067255020142, + "num_tokens": 305183928.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.516568899154663, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.866234302520752, + "num_tokens": 305227209.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.7141380310058594, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.875352680683136, + "num_tokens": 305259431.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.9147889614105225, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8629251718521118, + "num_tokens": 305294644.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.7979639768600464, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8768726587295532, + "num_tokens": 305327718.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6594264507293701, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8714290857315063, + "num_tokens": 305366958.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.6404411792755127, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8657798767089844, + "num_tokens": 305407546.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.6858404874801636, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8810053467750549, + "num_tokens": 305447525.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.6844185590744019, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8712167739868164, + "num_tokens": 305485274.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 1.9669532775878906e-05, + "grad_norm": 1.5927917957305908, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8686946630477905, + "num_tokens": 305524269.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.571899175643921, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8655428886413574, + "num_tokens": 305565565.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7193278074264526, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8616307973861694, + "num_tokens": 305604156.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6425197124481201, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8699625730514526, + "num_tokens": 305638951.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6274681091308594, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8752134442329407, + "num_tokens": 305677338.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7600935697555542, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8807063698768616, + "num_tokens": 305711133.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5566203594207764, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8624361753463745, + "num_tokens": 305753668.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6387302875518799, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8703789710998535, + "num_tokens": 305791940.0, + "step": 8016 + }, + { + "epoch": 1.0198448034601195, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6011433601379395, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8797484636306763, + "num_tokens": 305833639.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.591973066329956, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.85167396068573, + "num_tokens": 305874996.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5276715755462646, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8793841600418091, + "num_tokens": 305914395.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7111268043518066, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8527402877807617, + "num_tokens": 305949996.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5832018852233887, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8726527690887451, + "num_tokens": 305994074.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.71208918094635, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8802772760391235, + "num_tokens": 306032077.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6603789329528809, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8843210935592651, + "num_tokens": 306066746.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5239168405532837, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8765965104103088, + "num_tokens": 306106347.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.578063726425171, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8725348711013794, + "num_tokens": 306145490.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6294364929199219, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8634123206138611, + "num_tokens": 306184328.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.441209077835083, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8692988753318787, + "num_tokens": 306229612.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6037993431091309, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8755266070365906, + "num_tokens": 306263945.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6036229133605957, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8623950481414795, + "num_tokens": 306305241.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6056480407714844, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.863764226436615, + "num_tokens": 306342784.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6753182411193848, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8712666630744934, + "num_tokens": 306383269.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7037301063537598, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8648102879524231, + "num_tokens": 306415754.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.459404706954956, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8790969848632812, + "num_tokens": 306455483.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5758217573165894, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8569411039352417, + "num_tokens": 306493703.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6679596900939941, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8649901151657104, + "num_tokens": 306528110.0, + "step": 8035 + }, + { + "epoch": 1.0222617987533393, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.8808571100234985, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8743150234222412, + "num_tokens": 306559663.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.547167181968689, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8720412254333496, + "num_tokens": 306598465.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.4975203275680542, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8750594258308411, + "num_tokens": 306639104.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6025679111480713, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8679095506668091, + "num_tokens": 306677078.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6935335397720337, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.873422384262085, + "num_tokens": 306711854.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6874550580978394, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8641039729118347, + "num_tokens": 306750194.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5583503246307373, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8729202747344971, + "num_tokens": 306789758.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.634892463684082, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8650102615356445, + "num_tokens": 306826976.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5439224243164062, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8672218322753906, + "num_tokens": 306868558.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.4498651027679443, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8713618516921997, + "num_tokens": 306914624.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.562516450881958, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8660682439804077, + "num_tokens": 306955569.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5558130741119385, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8589071035385132, + "num_tokens": 307001975.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6044495105743408, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8627454042434692, + "num_tokens": 307039949.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.697540521621704, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8666163682937622, + "num_tokens": 307076241.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5046076774597168, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8762540817260742, + "num_tokens": 307122638.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.4847729206085205, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8670925498008728, + "num_tokens": 307163502.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6672595739364624, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8632371425628662, + "num_tokens": 307201306.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6377332210540771, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8862695693969727, + "num_tokens": 307233668.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7216708660125732, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8629772663116455, + "num_tokens": 307270225.0, + "step": 8054 + }, + { + "epoch": 1.024678794046559, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5818166732788086, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8791762590408325, + "num_tokens": 307312006.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.581904411315918, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8677978515625, + "num_tokens": 307351286.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.567022442817688, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.861767053604126, + "num_tokens": 307396035.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.8862590789794922, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8678474426269531, + "num_tokens": 307427069.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.672692894935608, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.866397500038147, + "num_tokens": 307465675.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6152507066726685, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8724945783615112, + "num_tokens": 307505082.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5462816953659058, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8875328898429871, + "num_tokens": 307541042.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6309645175933838, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.865845263004303, + "num_tokens": 307581957.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.716934323310852, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8536322116851807, + "num_tokens": 307616180.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5793753862380981, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8599288463592529, + "num_tokens": 307657807.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7617107629776, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8761652708053589, + "num_tokens": 307689768.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6989039182662964, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8826225996017456, + "num_tokens": 307722176.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7165963649749756, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8621686697006226, + "num_tokens": 307758986.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5846621990203857, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.872872531414032, + "num_tokens": 307796299.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.492652177810669, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8882066607475281, + "num_tokens": 307835016.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7158538103103638, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8672702312469482, + "num_tokens": 307872071.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.621821641921997, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8709561824798584, + "num_tokens": 307909181.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6799219846725464, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8646810054779053, + "num_tokens": 307946906.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.8021196126937866, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8801467418670654, + "num_tokens": 307980386.0, + "step": 8073 + }, + { + "epoch": 1.0270957893397787, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.4920579195022583, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8705296516418457, + "num_tokens": 308021690.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5508426427841187, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8747982978820801, + "num_tokens": 308059723.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6590369939804077, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8749908804893494, + "num_tokens": 308100514.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6762276887893677, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.864252507686615, + "num_tokens": 308135521.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6744667291641235, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8697530031204224, + "num_tokens": 308174008.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6031794548034668, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8692588806152344, + "num_tokens": 308214504.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5816882848739624, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8664737939834595, + "num_tokens": 308255878.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6016077995300293, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8712351322174072, + "num_tokens": 308297889.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5794477462768555, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8640269041061401, + "num_tokens": 308340515.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6695255041122437, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8777563571929932, + "num_tokens": 308378869.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7119611501693726, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8810552358627319, + "num_tokens": 308417531.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5849440097808838, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8768966197967529, + "num_tokens": 308452559.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6118311882019043, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8708261847496033, + "num_tokens": 308491375.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7146965265274048, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8704044818878174, + "num_tokens": 308526611.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6512945890426636, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8628566265106201, + "num_tokens": 308567298.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7222336530685425, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8662387132644653, + "num_tokens": 308605925.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.5695288181304932, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8660581111907959, + "num_tokens": 308648031.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6074750423431396, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8803505897521973, + "num_tokens": 308683477.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.473582148551941, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8728724718093872, + "num_tokens": 308725859.0, + "step": 8092 + }, + { + "epoch": 1.0295127846329983, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.666186809539795, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8701291084289551, + "num_tokens": 308763242.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6560959815979004, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8638955950737, + "num_tokens": 308800687.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.8533470630645752, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8715726137161255, + "num_tokens": 308842883.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.7137147188186646, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8686641454696655, + "num_tokens": 308878216.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 1.9788742065429688e-05, + "grad_norm": 1.6301642656326294, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8703948259353638, + "num_tokens": 308918904.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6775777339935303, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8757520914077759, + "num_tokens": 308953720.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6982403993606567, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8792102336883545, + "num_tokens": 308985265.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6748785972595215, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8492083549499512, + "num_tokens": 309023324.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.628203272819519, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8681182265281677, + "num_tokens": 309064472.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6268864870071411, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8577985167503357, + "num_tokens": 309104409.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5567666292190552, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8847223520278931, + "num_tokens": 309144692.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5499285459518433, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8799750208854675, + "num_tokens": 309185688.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6083070039749146, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8737308979034424, + "num_tokens": 309224982.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7219332456588745, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8657961487770081, + "num_tokens": 309263395.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5746876001358032, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8753869533538818, + "num_tokens": 309302835.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6261588335037231, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8790760040283203, + "num_tokens": 309341784.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.59026038646698, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8735320568084717, + "num_tokens": 309383432.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5891929864883423, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8720920085906982, + "num_tokens": 309420950.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7857511043548584, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8504269123077393, + "num_tokens": 309456707.0, + "step": 8111 + }, + { + "epoch": 1.0319297799262181, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6893081665039062, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8769897222518921, + "num_tokens": 309490531.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7025905847549438, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8687683343887329, + "num_tokens": 309524923.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5657331943511963, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8884799480438232, + "num_tokens": 309562769.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6068260669708252, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.870377242565155, + "num_tokens": 309600046.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.635794758796692, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8577618598937988, + "num_tokens": 309640638.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6538249254226685, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8833404183387756, + "num_tokens": 309673256.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5199923515319824, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8646101951599121, + "num_tokens": 309712703.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6571136713027954, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8579413890838623, + "num_tokens": 309749504.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.700667142868042, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8631142377853394, + "num_tokens": 309783834.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.529953956604004, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8689249753952026, + "num_tokens": 309826974.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5581634044647217, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8755295276641846, + "num_tokens": 309866950.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5768064260482788, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8731825351715088, + "num_tokens": 309905343.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.592148780822754, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8550996780395508, + "num_tokens": 309947171.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6774967908859253, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8628098368644714, + "num_tokens": 309979721.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4674843549728394, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8750318288803101, + "num_tokens": 310017060.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7036677598953247, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8745456337928772, + "num_tokens": 310049599.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.740106225013733, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8640056252479553, + "num_tokens": 310084841.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5282617807388306, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8627198338508606, + "num_tokens": 310130433.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.644429087638855, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8595848083496094, + "num_tokens": 310170716.0, + "step": 8130 + }, + { + "epoch": 1.0343467752194377, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6553928852081299, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8744769096374512, + "num_tokens": 310206493.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5461008548736572, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8589425086975098, + "num_tokens": 310250806.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5334734916687012, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8784676790237427, + "num_tokens": 310292346.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6023145914077759, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8689702749252319, + "num_tokens": 310333503.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.76411771774292, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8564733266830444, + "num_tokens": 310372818.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6968836784362793, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8633951544761658, + "num_tokens": 310410031.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5755977630615234, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8693065643310547, + "num_tokens": 310453014.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6678963899612427, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8653183579444885, + "num_tokens": 310487631.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.644266963005066, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8684749603271484, + "num_tokens": 310533285.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7072410583496094, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.860960841178894, + "num_tokens": 310571787.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.828797459602356, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8605372905731201, + "num_tokens": 310610937.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.840275526046753, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8515515327453613, + "num_tokens": 310646277.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6736252307891846, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.881662905216217, + "num_tokens": 310680358.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6807656288146973, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8577427268028259, + "num_tokens": 310718469.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6088649034500122, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8733096718788147, + "num_tokens": 310759619.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6543689966201782, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8867660164833069, + "num_tokens": 310800514.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.631097674369812, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8699533343315125, + "num_tokens": 310842005.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.8449150323867798, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8676204681396484, + "num_tokens": 310873532.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6116927862167358, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8558400869369507, + "num_tokens": 310913839.0, + "step": 8149 + }, + { + "epoch": 1.0367637705126573, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6691533327102661, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8609251379966736, + "num_tokens": 310952975.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5507415533065796, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8753331899642944, + "num_tokens": 310992806.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6289254426956177, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8698102831840515, + "num_tokens": 311028208.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7138723134994507, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8787722587585449, + "num_tokens": 311064498.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.590938687324524, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8598589897155762, + "num_tokens": 311102968.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.726457953453064, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8632597327232361, + "num_tokens": 311136382.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.57667076587677, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8688368797302246, + "num_tokens": 311175824.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5952939987182617, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8743157982826233, + "num_tokens": 311212377.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6185722351074219, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8668818473815918, + "num_tokens": 311247772.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6042678356170654, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8696504831314087, + "num_tokens": 311288462.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.463055968284607, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8861147165298462, + "num_tokens": 311330506.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5728245973587036, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8793390989303589, + "num_tokens": 311367710.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4422674179077148, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8802500367164612, + "num_tokens": 311409065.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6801048517227173, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8673085570335388, + "num_tokens": 311444227.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.634155035018921, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8794633150100708, + "num_tokens": 311483393.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5614224672317505, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8783435225486755, + "num_tokens": 311524112.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6900111436843872, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8629094362258911, + "num_tokens": 311560865.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6510505676269531, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8610382080078125, + "num_tokens": 311600637.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7550227642059326, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8755732774734497, + "num_tokens": 311633216.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6352037191390991, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8566869497299194, + "num_tokens": 311671361.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.566442608833313, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8681401014328003, + "num_tokens": 311711119.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.641554832458496, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8622114062309265, + "num_tokens": 311749710.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5114213228225708, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8777371644973755, + "num_tokens": 311791614.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5151773691177368, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8724342584609985, + "num_tokens": 311832898.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5417325496673584, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.869185209274292, + "num_tokens": 311874984.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7766029834747314, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8688331842422485, + "num_tokens": 311908653.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.534732460975647, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8814443349838257, + "num_tokens": 311947807.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.619059681892395, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8650410175323486, + "num_tokens": 311985041.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.759711742401123, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8622097969055176, + "num_tokens": 312021551.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.540225625038147, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8515793085098267, + "num_tokens": 312064678.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7139312028884888, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8648562431335449, + "num_tokens": 312099581.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7664042711257935, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8733775019645691, + "num_tokens": 312131533.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6252641677856445, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8547472953796387, + "num_tokens": 312174383.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.723749041557312, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8549573421478271, + "num_tokens": 312213515.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5776256322860718, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8739244341850281, + "num_tokens": 312253168.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5594981908798218, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8502397537231445, + "num_tokens": 312295409.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4542239904403687, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8849179148674011, + "num_tokens": 312336914.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5819013118743896, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8715448379516602, + "num_tokens": 312379660.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6326297521591187, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8864485025405884, + "num_tokens": 312414960.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6356955766677856, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.855365514755249, + "num_tokens": 312451503.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5834639072418213, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8823095560073853, + "num_tokens": 312486105.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6343406438827515, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8752665519714355, + "num_tokens": 312522520.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.610992431640625, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8806055784225464, + "num_tokens": 312560063.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4195078611373901, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8828681707382202, + "num_tokens": 312603613.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5847141742706299, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8636627197265625, + "num_tokens": 312646028.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.573285460472107, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8780614137649536, + "num_tokens": 312681747.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6645430326461792, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8761347532272339, + "num_tokens": 312718874.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.580194354057312, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8581372499465942, + "num_tokens": 312761565.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6592743396759033, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8701032400131226, + "num_tokens": 312798484.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6779277324676514, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8589661121368408, + "num_tokens": 312835403.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6898353099822998, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8680423498153687, + "num_tokens": 312870970.0, + "step": 8200 + }, + { + "epoch": 1.0432514947207734, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5664218664169312, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8756042718887329, + "num_tokens": 312911520.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6691362857818604, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8810043334960938, + "num_tokens": 312947221.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7446221113204956, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8758625984191895, + "num_tokens": 312986634.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5501091480255127, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8772295713424683, + "num_tokens": 313023248.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.560896873474121, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8675810098648071, + "num_tokens": 313064245.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7394527196884155, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8724707961082458, + "num_tokens": 313098352.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6311075687408447, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8815966844558716, + "num_tokens": 313134346.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6618781089782715, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8749461770057678, + "num_tokens": 313171095.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6973644495010376, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8674588203430176, + "num_tokens": 313208001.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7186601161956787, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8764778971672058, + "num_tokens": 313240029.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6911652088165283, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8666455745697021, + "num_tokens": 313274684.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.554771900177002, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.872834324836731, + "num_tokens": 313318399.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6111595630645752, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.878830075263977, + "num_tokens": 313356108.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4630852937698364, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8769476413726807, + "num_tokens": 313397231.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7557356357574463, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8753774762153625, + "num_tokens": 313430951.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5525444746017456, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8744207620620728, + "num_tokens": 313469455.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.693642020225525, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8619406819343567, + "num_tokens": 313505845.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.566174864768982, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8604477643966675, + "num_tokens": 313547401.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5382484197616577, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8752005100250244, + "num_tokens": 313586596.0, + "step": 8219 + }, + { + "epoch": 1.0456684900139932, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4865609407424927, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.875807523727417, + "num_tokens": 313629583.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5665534734725952, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8712278604507446, + "num_tokens": 313665549.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5447819232940674, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8687351942062378, + "num_tokens": 313705991.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.744950771331787, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8716931939125061, + "num_tokens": 313739308.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6383013725280762, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8742529153823853, + "num_tokens": 313773638.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 2.0388216972351074, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.854469895362854, + "num_tokens": 313810781.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4931789636611938, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8773934245109558, + "num_tokens": 313851925.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4422959089279175, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.876420259475708, + "num_tokens": 313896304.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5281586647033691, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8780383467674255, + "num_tokens": 313939484.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5705918073654175, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8764155507087708, + "num_tokens": 313977739.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.615014672279358, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.872573733329773, + "num_tokens": 314020895.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5665732622146606, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8608871698379517, + "num_tokens": 314064972.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.566863775253296, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8694646954536438, + "num_tokens": 314104935.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6612375974655151, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8820092082023621, + "num_tokens": 314140576.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6232751607894897, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8609240055084229, + "num_tokens": 314181274.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6622157096862793, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8751595616340637, + "num_tokens": 314220688.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4473178386688232, + "learning_rate": 1e-06, + "loss": 0.3096, + "mean_token_accuracy": 0.8968798518180847, + "num_tokens": 314259355.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.580676794052124, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8757229447364807, + "num_tokens": 314295190.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.648392915725708, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8656498789787292, + "num_tokens": 314332926.0, + "step": 8238 + }, + { + "epoch": 1.0480854853072128, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6046311855316162, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8430176377296448, + "num_tokens": 314377731.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6358630657196045, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8559091687202454, + "num_tokens": 314412745.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.664964199066162, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8764420747756958, + "num_tokens": 314448101.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5950210094451904, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8707147836685181, + "num_tokens": 314486152.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5795892477035522, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8670905828475952, + "num_tokens": 314523690.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5889261960983276, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8794385194778442, + "num_tokens": 314562454.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7116198539733887, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8742424249649048, + "num_tokens": 314598358.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.647268295288086, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8513681888580322, + "num_tokens": 314634924.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.625522255897522, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8657143115997314, + "num_tokens": 314675497.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6825594902038574, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8710837364196777, + "num_tokens": 314712357.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7531276941299438, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8706820011138916, + "num_tokens": 314747234.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.5475744009017944, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8578537702560425, + "num_tokens": 314786980.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7084860801696777, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8620123863220215, + "num_tokens": 314820915.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5450702905654907, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8755539655685425, + "num_tokens": 314860256.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6836607456207275, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8604228496551514, + "num_tokens": 314895518.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6180466413497925, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8702585697174072, + "num_tokens": 314930995.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6369074583053589, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8612110018730164, + "num_tokens": 314967380.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6326850652694702, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8712388873100281, + "num_tokens": 315004935.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5986536741256714, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.877922773361206, + "num_tokens": 315039670.0, + "step": 8257 + }, + { + "epoch": 1.0505024806004326, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7096062898635864, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8769930601119995, + "num_tokens": 315073648.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5797098875045776, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8669067621231079, + "num_tokens": 315114293.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7281736135482788, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8644009232521057, + "num_tokens": 315151085.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7000783681869507, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8597691059112549, + "num_tokens": 315189662.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4953620433807373, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8830891847610474, + "num_tokens": 315228333.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.649735689163208, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8702956438064575, + "num_tokens": 315262974.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6881015300750732, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.873155415058136, + "num_tokens": 315296998.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.662831425666809, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.876264750957489, + "num_tokens": 315333729.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7373133897781372, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8647136688232422, + "num_tokens": 315369328.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6948188543319702, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8710851669311523, + "num_tokens": 315403946.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6356884241104126, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8758617639541626, + "num_tokens": 315441918.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6960668563842773, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8612241744995117, + "num_tokens": 315476560.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5790108442306519, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8692202568054199, + "num_tokens": 315519118.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.809566855430603, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8598592281341553, + "num_tokens": 315553549.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6441055536270142, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8798632621765137, + "num_tokens": 315591546.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6545872688293457, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8619897365570068, + "num_tokens": 315631803.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.532513976097107, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8742631673812866, + "num_tokens": 315671000.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5788294076919556, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8759602308273315, + "num_tokens": 315706998.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6058965921401978, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8666036128997803, + "num_tokens": 315747293.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4171675443649292, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8751975297927856, + "num_tokens": 315793849.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.79428231716156, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.858674168586731, + "num_tokens": 315828515.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6734914779663086, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8637399673461914, + "num_tokens": 315864252.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5736819505691528, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8748307824134827, + "num_tokens": 315903179.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.5779595375061035, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8560205698013306, + "num_tokens": 315946267.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6324071884155273, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8675819635391235, + "num_tokens": 315982055.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6091464757919312, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8800166845321655, + "num_tokens": 316018038.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.4492394924163818, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8777892589569092, + "num_tokens": 316061370.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5682473182678223, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8626911044120789, + "num_tokens": 316102764.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.511824607849121, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.875203013420105, + "num_tokens": 316141352.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5510138273239136, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8790002465248108, + "num_tokens": 316180106.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5027318000793457, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8789417743682861, + "num_tokens": 316222017.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.4703806638717651, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8616392612457275, + "num_tokens": 316267888.0, + "step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.599066138267517, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8673243522644043, + "num_tokens": 316307668.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.5982962846755981, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8758333921432495, + "num_tokens": 316346485.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.721724033355713, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8821691274642944, + "num_tokens": 316378865.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.5856012105941772, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8642752170562744, + "num_tokens": 316417600.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.614794373512268, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8802708387374878, + "num_tokens": 316453754.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7362658977508545, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8760972023010254, + "num_tokens": 316485043.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6127432584762573, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8838498592376709, + "num_tokens": 316520097.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.7161990404129028, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8578563928604126, + "num_tokens": 316559314.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.596769094467163, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8626641631126404, + "num_tokens": 316599090.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6044224500656128, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8617942333221436, + "num_tokens": 316640818.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6117463111877441, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8728328943252563, + "num_tokens": 316681271.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.652433156967163, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8765004277229309, + "num_tokens": 316715326.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6457751989364624, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8680097460746765, + "num_tokens": 316751733.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6710525751113892, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8686786890029907, + "num_tokens": 316788938.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.6510075330734253, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8677263259887695, + "num_tokens": 316827342.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "ewc_loss": 2.002716064453125e-05, + "grad_norm": 1.686495304107666, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8669764995574951, + "num_tokens": 316869699.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 1.990795135498047e-05, + "grad_norm": 1.8056116104125977, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8718358278274536, + "num_tokens": 316900278.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.8900731801986694, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8433095216751099, + "num_tokens": 316936999.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5888988971710205, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8780576586723328, + "num_tokens": 316978179.0, + "step": 8308 + }, + { + "epoch": 1.0569902048085484, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6255232095718384, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8616412281990051, + "num_tokens": 317020384.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5634504556655884, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8693737387657166, + "num_tokens": 317061370.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.778407335281372, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8627872467041016, + "num_tokens": 317093875.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.693111777305603, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8690251708030701, + "num_tokens": 317128995.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.555687427520752, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8724563121795654, + "num_tokens": 317167888.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6423741579055786, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8692958950996399, + "num_tokens": 317208874.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.8158726692199707, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8735729455947876, + "num_tokens": 317244495.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6803464889526367, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8685961961746216, + "num_tokens": 317283187.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6839765310287476, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8631436824798584, + "num_tokens": 317325922.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.732810139656067, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8714820146560669, + "num_tokens": 317359769.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6318950653076172, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8651421070098877, + "num_tokens": 317399328.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6218477487564087, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.877179741859436, + "num_tokens": 317437115.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.4982999563217163, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8558553457260132, + "num_tokens": 317486264.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.609945297241211, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8775816559791565, + "num_tokens": 317521608.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5203739404678345, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8630527257919312, + "num_tokens": 317563831.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6012489795684814, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8757559657096863, + "num_tokens": 317599599.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.827207088470459, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.854331374168396, + "num_tokens": 317634225.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6446284055709839, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8621077537536621, + "num_tokens": 317672002.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.4559141397476196, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8697810173034668, + "num_tokens": 317716705.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6510624885559082, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8672945499420166, + "num_tokens": 317753343.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.654210090637207, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8630664348602295, + "num_tokens": 317789932.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.538358449935913, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.862971305847168, + "num_tokens": 317829984.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5874971151351929, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8783248662948608, + "num_tokens": 317869273.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5284041166305542, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8726238012313843, + "num_tokens": 317907236.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 2.014636993408203e-05, + "grad_norm": 1.6906378269195557, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.873665452003479, + "num_tokens": 317941719.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7450718879699707, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8654940724372864, + "num_tokens": 317977538.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5577778816223145, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8679300546646118, + "num_tokens": 318017362.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.55801522731781, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8661773800849915, + "num_tokens": 318059943.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5437779426574707, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.871261715888977, + "num_tokens": 318104200.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.529160499572754, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.882185697555542, + "num_tokens": 318145457.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.586443543434143, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8664523363113403, + "num_tokens": 318183895.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6571316719055176, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.875877320766449, + "num_tokens": 318220110.0, + "step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5508428812026978, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8742364645004272, + "num_tokens": 318261995.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7684576511383057, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8629928827285767, + "num_tokens": 318296435.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.515297770500183, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8853657245635986, + "num_tokens": 318333211.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5204808712005615, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8868189454078674, + "num_tokens": 318372357.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6293504238128662, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.884290337562561, + "num_tokens": 318406396.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7199671268463135, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8560830354690552, + "num_tokens": 318445089.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.707162857055664, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.874627947807312, + "num_tokens": 318481166.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5591949224472046, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8685794472694397, + "num_tokens": 318524184.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6765388250350952, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8623725771903992, + "num_tokens": 318561238.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7715914249420166, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8505065441131592, + "num_tokens": 318595507.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6252546310424805, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.875870943069458, + "num_tokens": 318630811.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6261721849441528, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8674241304397583, + "num_tokens": 318667935.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.582553505897522, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8801287412643433, + "num_tokens": 318704993.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6069978475570679, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8580399751663208, + "num_tokens": 318741882.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5123215913772583, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8760021328926086, + "num_tokens": 318778294.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.785086750984192, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8591504096984863, + "num_tokens": 318812721.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7249332666397095, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.872144341468811, + "num_tokens": 318850344.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.414829969406128, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8791996240615845, + "num_tokens": 318891582.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5572423934936523, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8738200664520264, + "num_tokens": 318932848.0, + "step": 8359 + }, + { + "epoch": 1.0634779290166645, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5101386308670044, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8706023693084717, + "num_tokens": 318975945.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5059565305709839, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8636077642440796, + "num_tokens": 319020530.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6328681707382202, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8700582981109619, + "num_tokens": 319056953.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6352627277374268, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8831168413162231, + "num_tokens": 319092355.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.566697597503662, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.871124267578125, + "num_tokens": 319134236.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5067534446716309, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.882082462310791, + "num_tokens": 319174483.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5329701900482178, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8805013298988342, + "num_tokens": 319212425.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7932523488998413, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8669506311416626, + "num_tokens": 319247387.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5870146751403809, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8654849529266357, + "num_tokens": 319288268.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6491339206695557, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8597116470336914, + "num_tokens": 319324809.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6079367399215698, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.880612850189209, + "num_tokens": 319360060.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.692968726158142, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8684221506118774, + "num_tokens": 319394343.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.660122036933899, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8507257699966431, + "num_tokens": 319434642.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.3822203874588013, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8723832368850708, + "num_tokens": 319483461.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5679997205734253, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8726532459259033, + "num_tokens": 319522408.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.4801546335220337, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8781171441078186, + "num_tokens": 319560572.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.8856658935546875, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8696624040603638, + "num_tokens": 319595995.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5593812465667725, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8786723613739014, + "num_tokens": 319634023.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6106802225112915, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8742235898971558, + "num_tokens": 319668376.0, + "step": 8378 + }, + { + "epoch": 1.0658949243098843, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.4982939958572388, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8662956357002258, + "num_tokens": 319713346.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.5761913061141968, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8839316368103027, + "num_tokens": 319753128.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6537694931030273, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8732956647872925, + "num_tokens": 319789592.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.703049659729004, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8726226091384888, + "num_tokens": 319823200.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.515803337097168, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.870999813079834, + "num_tokens": 319867290.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.7383363246917725, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8697025775909424, + "num_tokens": 319906234.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.8192481994628906, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8488874435424805, + "num_tokens": 319941585.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.6787773370742798, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8658093214035034, + "num_tokens": 319978241.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5858851671218872, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8720307350158691, + "num_tokens": 320018648.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.4890819787979126, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8783466219902039, + "num_tokens": 320059186.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5468738079071045, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8821402788162231, + "num_tokens": 320095452.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5133002996444702, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8719310164451599, + "num_tokens": 320138307.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.542733907699585, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.877242386341095, + "num_tokens": 320177773.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5761672258377075, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8695193529129028, + "num_tokens": 320213742.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 2.0265579223632812e-05, + "grad_norm": 1.572726845741272, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8831812143325806, + "num_tokens": 320248895.0, + "step": 8393 + }, + { + "epoch": 1.0678030784887418, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5761693716049194, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8722018003463745, + "num_tokens": 320288601.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5731422901153564, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8694832921028137, + "num_tokens": 320326638.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6757322549819946, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8608030080795288, + "num_tokens": 320364802.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7441047430038452, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8692108392715454, + "num_tokens": 320401602.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.633508324623108, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8711168169975281, + "num_tokens": 320438651.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6463017463684082, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8712176084518433, + "num_tokens": 320477104.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.536463975906372, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8767327070236206, + "num_tokens": 320517421.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5364128351211548, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8755112290382385, + "num_tokens": 320559708.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6187998056411743, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8791140913963318, + "num_tokens": 320594597.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6930348873138428, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8825253844261169, + "num_tokens": 320632649.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.764004111289978, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8655241131782532, + "num_tokens": 320665699.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.4893184900283813, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8660941123962402, + "num_tokens": 320706523.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6914840936660767, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8560104370117188, + "num_tokens": 320741665.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.716421127319336, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8752993941307068, + "num_tokens": 320774606.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7935359477996826, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8468030691146851, + "num_tokens": 320811879.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.8075374364852905, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8610647916793823, + "num_tokens": 320848472.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.8251696825027466, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.851012647151947, + "num_tokens": 320883938.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6266186237335205, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.878108024597168, + "num_tokens": 320919438.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.559105634689331, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8598136901855469, + "num_tokens": 320961336.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7031774520874023, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8684614896774292, + "num_tokens": 320992756.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.518825888633728, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8676644563674927, + "num_tokens": 321036399.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7485829591751099, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8643510341644287, + "num_tokens": 321068568.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.642364501953125, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8666671514511108, + "num_tokens": 321108295.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7432286739349365, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.871248185634613, + "num_tokens": 321143766.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.4492608308792114, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8788174390792847, + "num_tokens": 321186188.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.7854946851730347, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8758023977279663, + "num_tokens": 321218079.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6349717378616333, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.874366044998169, + "num_tokens": 321254211.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6314536333084106, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8731703162193298, + "num_tokens": 321291145.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5726920366287231, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8727096915245056, + "num_tokens": 321326588.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6672672033309937, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8749104738235474, + "num_tokens": 321363641.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.83860445022583, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8478754162788391, + "num_tokens": 321394508.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.6892297267913818, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8556057810783386, + "num_tokens": 321434881.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.584033727645874, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8814562559127808, + "num_tokens": 321471676.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.460141897201538, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8876911401748657, + "num_tokens": 321512647.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.464098572731018, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8901100158691406, + "num_tokens": 321556048.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.603164553642273, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8734297752380371, + "num_tokens": 321592420.0, + "step": 8429 + }, + { + "epoch": 1.0723826485180004, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5968427658081055, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8774259686470032, + "num_tokens": 321627625.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.538896918296814, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.865070104598999, + "num_tokens": 321667794.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5727139711380005, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8736919164657593, + "num_tokens": 321705773.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.629501461982727, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8836809396743774, + "num_tokens": 321743340.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5350878238677979, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8762295246124268, + "num_tokens": 321785055.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.5262072086334229, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.884416401386261, + "num_tokens": 321822470.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.5589956045150757, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.876410722732544, + "num_tokens": 321861723.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.516050100326538, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8804872035980225, + "num_tokens": 321902409.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.575748324394226, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8685541152954102, + "num_tokens": 321941813.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.765580415725708, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8542067408561707, + "num_tokens": 321980747.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.6265151500701904, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8607925176620483, + "num_tokens": 322021149.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.608593463897705, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8614462614059448, + "num_tokens": 322062709.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.477786660194397, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8822910785675049, + "num_tokens": 322104823.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.5905959606170654, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8758617043495178, + "num_tokens": 322140299.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6733160018920898, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8814165592193604, + "num_tokens": 322178922.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.724137306213379, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8613779544830322, + "num_tokens": 322218552.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.851981282234192, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8620078563690186, + "num_tokens": 322250593.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.5485546588897705, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8824199438095093, + "num_tokens": 322289652.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.497208833694458, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8797372579574585, + "num_tokens": 322330377.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.6311906576156616, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8785173296928406, + "num_tokens": 322364164.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.6595498323440552, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8634704351425171, + "num_tokens": 322405633.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7548916339874268, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8712445497512817, + "num_tokens": 322439606.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.8157199621200562, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8707671761512756, + "num_tokens": 322470680.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6445406675338745, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8739724159240723, + "num_tokens": 322503645.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 2.0503997802734375e-05, + "grad_norm": 1.6075036525726318, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8684329390525818, + "num_tokens": 322541531.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.670479416847229, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8613995313644409, + "num_tokens": 322584067.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5880062580108643, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8873153924942017, + "num_tokens": 322621291.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7031522989273071, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8660795092582703, + "num_tokens": 322658460.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.619583010673523, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8647702932357788, + "num_tokens": 322697402.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.660077691078186, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8723123073577881, + "num_tokens": 322733625.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.405757188796997, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8781009912490845, + "num_tokens": 322779446.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6041139364242554, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8779647946357727, + "num_tokens": 322819460.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5601435899734497, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8726972937583923, + "num_tokens": 322859195.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5073673725128174, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8747329711914062, + "num_tokens": 322900624.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6200779676437378, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8671240210533142, + "num_tokens": 322940690.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6968613862991333, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8707728981971741, + "num_tokens": 322972659.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 3.6854026317596436, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.881659209728241, + "num_tokens": 323006943.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.4675148725509644, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8822718262672424, + "num_tokens": 323047415.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7893091440200806, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8565965890884399, + "num_tokens": 323084940.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 16.704395294189453, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8680380582809448, + "num_tokens": 323122946.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6304513216018677, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8649334907531738, + "num_tokens": 323161719.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5581191778182983, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8697589635848999, + "num_tokens": 323202188.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6554478406906128, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8725258708000183, + "num_tokens": 323237414.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5592751502990723, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.873832106590271, + "num_tokens": 323275120.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7015819549560547, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8625445365905762, + "num_tokens": 323313484.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6295111179351807, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8751406669616699, + "num_tokens": 323348349.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5541127920150757, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8734149932861328, + "num_tokens": 323387409.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5607072114944458, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8570482134819031, + "num_tokens": 323428992.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.581653356552124, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8789879083633423, + "num_tokens": 323465467.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7293933629989624, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8534058928489685, + "num_tokens": 323501526.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6349327564239502, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8742752075195312, + "num_tokens": 323540498.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 2.0384788513183594e-05, + "grad_norm": 1.825922966003418, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8460798263549805, + "num_tokens": 323577076.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6569490432739258, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8640244007110596, + "num_tokens": 323613822.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6393979787826538, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8663098812103271, + "num_tokens": 323650873.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.4868987798690796, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8893129825592041, + "num_tokens": 323689970.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6930493116378784, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8657348155975342, + "num_tokens": 323726168.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.520267367362976, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8841725587844849, + "num_tokens": 323768106.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.516899824142456, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8533955812454224, + "num_tokens": 323811064.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6048551797866821, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8751102685928345, + "num_tokens": 323849986.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6644753217697144, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8625006675720215, + "num_tokens": 323895033.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.546244502067566, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8721294403076172, + "num_tokens": 323935658.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.646141767501831, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8666383624076843, + "num_tokens": 323972113.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5556987524032593, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8731745481491089, + "num_tokens": 324011553.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7163989543914795, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8563709855079651, + "num_tokens": 324046841.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6194404363632202, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8695017695426941, + "num_tokens": 324085854.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.705726981163025, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8602250218391418, + "num_tokens": 324122589.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.75706148147583, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8678680658340454, + "num_tokens": 324158876.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.521101951599121, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8806491494178772, + "num_tokens": 324198989.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.60035240650177, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8692314624786377, + "num_tokens": 324241193.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.636146903038025, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.874355137348175, + "num_tokens": 324278945.0, + "step": 8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5872812271118164, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8609665632247925, + "num_tokens": 324318823.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5903476476669312, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8665294647216797, + "num_tokens": 324355563.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7326987981796265, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8710381388664246, + "num_tokens": 324387888.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6788476705551147, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8684231042861938, + "num_tokens": 324425240.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7622298002243042, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8560326099395752, + "num_tokens": 324460371.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.538919448852539, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8553944826126099, + "num_tokens": 324500277.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5605424642562866, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8804905414581299, + "num_tokens": 324541013.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.732969880104065, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8526862859725952, + "num_tokens": 324574967.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6166176795959473, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8786539435386658, + "num_tokens": 324612413.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7035983800888062, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8676251173019409, + "num_tokens": 324648369.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5205800533294678, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.873870849609375, + "num_tokens": 324689056.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6749553680419922, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8748325705528259, + "num_tokens": 324722207.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.564723253250122, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8520390391349792, + "num_tokens": 324769491.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.4492576122283936, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.881503701210022, + "num_tokens": 324813063.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.8672581911087036, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8774364590644836, + "num_tokens": 324843860.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.4965484142303467, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8628268241882324, + "num_tokens": 324886900.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5958644151687622, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8676477074623108, + "num_tokens": 324923970.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6423572301864624, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8601691722869873, + "num_tokens": 324961880.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6516090631484985, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8682165741920471, + "num_tokens": 325001315.0, + "step": 8518 + }, + { + "epoch": 1.0837043633125556, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.778883695602417, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8689849376678467, + "num_tokens": 325034874.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5987088680267334, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.863105297088623, + "num_tokens": 325072767.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5630197525024414, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8833656907081604, + "num_tokens": 325108986.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5641014575958252, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8857619762420654, + "num_tokens": 325147102.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6781173944473267, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8593482971191406, + "num_tokens": 325184817.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.500876545906067, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8830515146255493, + "num_tokens": 325223276.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7332550287246704, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8757078051567078, + "num_tokens": 325261152.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.577980637550354, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8792845010757446, + "num_tokens": 325298007.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7809334993362427, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8712925314903259, + "num_tokens": 325331915.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.667796015739441, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8694177865982056, + "num_tokens": 325365786.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7705731391906738, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8827999234199524, + "num_tokens": 325400704.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.589615821838379, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8812627792358398, + "num_tokens": 325440293.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5377734899520874, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8760148286819458, + "num_tokens": 325481031.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.772363305091858, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8658090829849243, + "num_tokens": 325514761.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7319947481155396, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8560545444488525, + "num_tokens": 325555592.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.709700345993042, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8681859374046326, + "num_tokens": 325590721.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.4804835319519043, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8821172118186951, + "num_tokens": 325633622.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.729203224182129, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8719346523284912, + "num_tokens": 325668660.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.6041456460952759, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8805803060531616, + "num_tokens": 325705835.0, + "step": 8537 + }, + { + "epoch": 1.0861213586057754, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.5425653457641602, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8688298463821411, + "num_tokens": 325744310.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 2.0623207092285156e-05, + "grad_norm": 1.7463489770889282, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8528209328651428, + "num_tokens": 325783249.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.631745457649231, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8694952726364136, + "num_tokens": 325823786.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6124049425125122, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8772405385971069, + "num_tokens": 325860042.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5101450681686401, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8713089227676392, + "num_tokens": 325900822.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5010082721710205, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8745735287666321, + "num_tokens": 325941601.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6702073812484741, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8709039688110352, + "num_tokens": 325977189.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5276355743408203, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8849425315856934, + "num_tokens": 326016083.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7122410535812378, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8458046317100525, + "num_tokens": 326054768.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5869394540786743, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8519891500473022, + "num_tokens": 326094827.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6795419454574585, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8666304349899292, + "num_tokens": 326128472.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.643646001815796, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8560602068901062, + "num_tokens": 326168156.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7349299192428589, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8517159223556519, + "num_tokens": 326207707.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.76741361618042, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8528963327407837, + "num_tokens": 326244956.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.9199903011322021, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8557876944541931, + "num_tokens": 326275851.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7415285110473633, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8698105812072754, + "num_tokens": 326314745.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6069159507751465, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8789440989494324, + "num_tokens": 326357361.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.768439769744873, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8761885166168213, + "num_tokens": 326395060.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.738182544708252, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8795541524887085, + "num_tokens": 326429678.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.3930774927139282, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8624827861785889, + "num_tokens": 326481634.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5074303150177002, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8677618503570557, + "num_tokens": 326525837.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7275851964950562, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8424860835075378, + "num_tokens": 326565274.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6670427322387695, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8630521297454834, + "num_tokens": 326603075.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5207579135894775, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8843234777450562, + "num_tokens": 326640864.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6255124807357788, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.863310694694519, + "num_tokens": 326681824.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.732888102531433, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8718037605285645, + "num_tokens": 326720929.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5897959470748901, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8758395910263062, + "num_tokens": 326758138.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5855437517166138, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8477597236633301, + "num_tokens": 326803223.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.58206045627594, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8598365783691406, + "num_tokens": 326847580.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.8164970874786377, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8766305446624756, + "num_tokens": 326881179.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.594559907913208, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8564238548278809, + "num_tokens": 326919763.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5739680528640747, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8558380603790283, + "num_tokens": 326962631.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7421817779541016, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.855345606803894, + "num_tokens": 326998156.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.627479076385498, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8671096563339233, + "num_tokens": 327036231.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5549962520599365, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8666855096817017, + "num_tokens": 327076594.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.525887131690979, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8764408826828003, + "num_tokens": 327121829.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5704517364501953, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8803591132164001, + "num_tokens": 327157649.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6635164022445679, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8701491951942444, + "num_tokens": 327192209.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6967977285385132, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8738461136817932, + "num_tokens": 327230366.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5829285383224487, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8781852722167969, + "num_tokens": 327266040.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.491836428642273, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8797568082809448, + "num_tokens": 327303216.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5468995571136475, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8832447528839111, + "num_tokens": 327339152.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7089548110961914, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8664066791534424, + "num_tokens": 327371868.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5832058191299438, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8615177869796753, + "num_tokens": 327415518.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.4997310638427734, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8745819330215454, + "num_tokens": 327458015.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6056174039840698, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8783141374588013, + "num_tokens": 327494782.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6069587469100952, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8627002239227295, + "num_tokens": 327537124.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6935789585113525, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8783339858055115, + "num_tokens": 327570307.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6558012962341309, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8718364238739014, + "num_tokens": 327607456.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6616183519363403, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8752821087837219, + "num_tokens": 327643523.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5201016664505005, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8728777170181274, + "num_tokens": 327684407.0, + "step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7092103958129883, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8782151937484741, + "num_tokens": 327721322.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7355093955993652, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8673028945922852, + "num_tokens": 327755816.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6723337173461914, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8638291358947754, + "num_tokens": 327792552.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.624348759651184, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8731487989425659, + "num_tokens": 327828666.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.7000961303710938, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8594077825546265, + "num_tokens": 327868424.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6722089052200317, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8739593029022217, + "num_tokens": 327905218.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.5415682792663574, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8832230567932129, + "num_tokens": 327943397.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6572840213775635, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8707315921783447, + "num_tokens": 327983540.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.6998990774154663, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8664524555206299, + "num_tokens": 328017879.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 2.0742416381835938e-05, + "grad_norm": 1.861190915107727, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8746786117553711, + "num_tokens": 328050273.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.658693790435791, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8752774000167847, + "num_tokens": 328085456.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.584120512008667, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8790837526321411, + "num_tokens": 328126818.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7310916185379028, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8656998872756958, + "num_tokens": 328163092.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5928930044174194, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8879953026771545, + "num_tokens": 328200927.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6721035242080688, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8736037015914917, + "num_tokens": 328235774.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.696028709411621, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8734064698219299, + "num_tokens": 328280268.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7748264074325562, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8695694208145142, + "num_tokens": 328313667.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.706815481185913, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8723187446594238, + "num_tokens": 328350758.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6739726066589355, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.872369647026062, + "num_tokens": 328389487.0, + "step": 8607 + }, + { + "epoch": 1.095026078107111, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6125444173812866, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8540763854980469, + "num_tokens": 328428263.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.772373914718628, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8656585812568665, + "num_tokens": 328463131.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7109266519546509, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.874129056930542, + "num_tokens": 328498149.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6823314428329468, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8574197888374329, + "num_tokens": 328537838.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5472475290298462, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8729947209358215, + "num_tokens": 328576126.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7522939443588257, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8523200750350952, + "num_tokens": 328612780.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6278053522109985, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8845322132110596, + "num_tokens": 328646093.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.547284722328186, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.879552960395813, + "num_tokens": 328684211.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5523494482040405, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8746907711029053, + "num_tokens": 328722012.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.633091926574707, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8884391784667969, + "num_tokens": 328754931.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.4918488264083862, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8718112111091614, + "num_tokens": 328796194.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6318111419677734, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8669170141220093, + "num_tokens": 328832073.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.612573266029358, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8774484395980835, + "num_tokens": 328865928.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.800657868385315, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8834523558616638, + "num_tokens": 328901596.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6852595806121826, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8666098713874817, + "num_tokens": 328938624.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5664231777191162, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8817366361618042, + "num_tokens": 328974371.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7078242301940918, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8535143136978149, + "num_tokens": 329014333.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5696834325790405, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8820525407791138, + "num_tokens": 329058018.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6611864566802979, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8742310404777527, + "num_tokens": 329094204.0, + "step": 8626 + }, + { + "epoch": 1.0974430734003306, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7354520559310913, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8747684955596924, + "num_tokens": 329125638.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7021387815475464, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.845585823059082, + "num_tokens": 329163228.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.679425835609436, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8880362510681152, + "num_tokens": 329199088.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5796009302139282, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8662399053573608, + "num_tokens": 329240021.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7609009742736816, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8721004724502563, + "num_tokens": 329273407.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.574519395828247, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8756933808326721, + "num_tokens": 329312258.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5314782857894897, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8624575138092041, + "num_tokens": 329354670.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7016239166259766, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8720768690109253, + "num_tokens": 329389910.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6965235471725464, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8568278551101685, + "num_tokens": 329425112.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6045074462890625, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8644267320632935, + "num_tokens": 329463202.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5434391498565674, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8829838037490845, + "num_tokens": 329501737.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5910652875900269, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8692540526390076, + "num_tokens": 329545854.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5143946409225464, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8570715188980103, + "num_tokens": 329594404.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.4621869325637817, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8871101140975952, + "num_tokens": 329636214.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.4968547821044922, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8705416917800903, + "num_tokens": 329677164.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5005478858947754, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8801593780517578, + "num_tokens": 329717467.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.763552188873291, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.87336665391922, + "num_tokens": 329751542.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5469661951065063, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8771734237670898, + "num_tokens": 329788350.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5917909145355225, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8620582222938538, + "num_tokens": 329830760.0, + "step": 8645 + }, + { + "epoch": 1.0998600686935505, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7621232271194458, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8800594806671143, + "num_tokens": 329864045.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.654582142829895, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8508709669113159, + "num_tokens": 329903809.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6562100648880005, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8852064609527588, + "num_tokens": 329936749.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7746399641036987, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8658684492111206, + "num_tokens": 329970810.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7331950664520264, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8696582317352295, + "num_tokens": 330007162.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6308846473693848, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8674942255020142, + "num_tokens": 330045015.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7268259525299072, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8787429332733154, + "num_tokens": 330077195.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6280004978179932, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8658118844032288, + "num_tokens": 330116453.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6218210458755493, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8641011118888855, + "num_tokens": 330154920.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6183573007583618, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8576763272285461, + "num_tokens": 330197525.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.619979977607727, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.86864173412323, + "num_tokens": 330233537.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.4763380289077759, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8711748123168945, + "num_tokens": 330277521.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.85433828830719, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8650317788124084, + "num_tokens": 330316752.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6491224765777588, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8738129734992981, + "num_tokens": 330352219.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6116328239440918, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8682605028152466, + "num_tokens": 330388993.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.5606763362884521, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8671476244926453, + "num_tokens": 330431002.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6946125030517578, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8713387250900269, + "num_tokens": 330466500.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5228928327560425, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8714871406555176, + "num_tokens": 330504852.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7221667766571045, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8585410118103027, + "num_tokens": 330540281.0, + "step": 8664 + }, + { + "epoch": 1.10227706398677, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6027356386184692, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8594201803207397, + "num_tokens": 330581022.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6170542240142822, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8814972043037415, + "num_tokens": 330617047.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.743401050567627, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8654372692108154, + "num_tokens": 330650552.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7644668817520142, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8834414482116699, + "num_tokens": 330684547.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.157701015472412, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8790636658668518, + "num_tokens": 330728757.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6869598627090454, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8919487595558167, + "num_tokens": 330765082.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.621760368347168, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.863495409488678, + "num_tokens": 330804055.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6458942890167236, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8665299415588379, + "num_tokens": 330840882.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6412365436553955, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.875764012336731, + "num_tokens": 330880451.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6629376411437988, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8667030334472656, + "num_tokens": 330917435.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.585120677947998, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8762418031692505, + "num_tokens": 330957411.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.762081265449524, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8722546100616455, + "num_tokens": 330988039.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6565873622894287, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8670270442962646, + "num_tokens": 331025170.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.4937299489974976, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8697683811187744, + "num_tokens": 331068132.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.6000288724899292, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.869609534740448, + "num_tokens": 331105540.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.8116708993911743, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8703941702842712, + "num_tokens": 331139545.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 2.086162567138672e-05, + "grad_norm": 1.7132327556610107, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8563526272773743, + "num_tokens": 331176610.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7907418012619019, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8759195804595947, + "num_tokens": 331213005.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6492540836334229, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8621348738670349, + "num_tokens": 331255284.0, + "step": 8683 + }, + { + "epoch": 1.1046940592799899, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 2.2551233768463135, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8627768754959106, + "num_tokens": 331292596.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5893559455871582, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8668348789215088, + "num_tokens": 331331773.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7394262552261353, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8722156286239624, + "num_tokens": 331366541.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5008426904678345, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8778676986694336, + "num_tokens": 331408084.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5877171754837036, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8664500713348389, + "num_tokens": 331444805.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5526156425476074, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8739599585533142, + "num_tokens": 331480999.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6802036762237549, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8573148250579834, + "num_tokens": 331516185.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.609979271888733, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8695082664489746, + "num_tokens": 331553420.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6851757764816284, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8751027584075928, + "num_tokens": 331585151.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5295857191085815, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.866309404373169, + "num_tokens": 331628742.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6431416273117065, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8658127784729004, + "num_tokens": 331666660.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6104305982589722, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8741276264190674, + "num_tokens": 331709759.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7296035289764404, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8783231377601624, + "num_tokens": 331743947.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5799802541732788, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8763552904129028, + "num_tokens": 331780379.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5581005811691284, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8672429323196411, + "num_tokens": 331823250.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6887849569320679, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8846085071563721, + "num_tokens": 331858047.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7042028903961182, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8563865423202515, + "num_tokens": 331897236.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6336506605148315, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8539723753929138, + "num_tokens": 331936293.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.549142599105835, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8788434863090515, + "num_tokens": 331976356.0, + "step": 8702 + }, + { + "epoch": 1.1071110545732095, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.4837268590927124, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8806570172309875, + "num_tokens": 332016875.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.8726600408554077, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8735107779502869, + "num_tokens": 332051863.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.628291130065918, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8728687167167664, + "num_tokens": 332088431.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6134029626846313, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8753260374069214, + "num_tokens": 332124861.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5629291534423828, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8773075342178345, + "num_tokens": 332165629.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.9871450662612915, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8604185581207275, + "num_tokens": 332196761.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7246428728103638, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8826307654380798, + "num_tokens": 332232964.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7379045486450195, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8766151666641235, + "num_tokens": 332266410.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6439601182937622, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8915399312973022, + "num_tokens": 332301881.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.626761555671692, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8690546154975891, + "num_tokens": 332343281.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6589164733886719, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8800051212310791, + "num_tokens": 332380584.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7209988832473755, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8697161674499512, + "num_tokens": 332417558.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.676396131515503, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8719434142112732, + "num_tokens": 332453057.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.522971272468567, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8744673728942871, + "num_tokens": 332499230.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6574680805206299, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8757072687149048, + "num_tokens": 332534378.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.653976559638977, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8749996423721313, + "num_tokens": 332571796.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.777159333229065, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8594445586204529, + "num_tokens": 332609677.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6592775583267212, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8725141882896423, + "num_tokens": 332643682.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.690233588218689, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8775322437286377, + "num_tokens": 332681241.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5288811922073364, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8706011772155762, + "num_tokens": 332719906.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6385341882705688, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8813678026199341, + "num_tokens": 332759654.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5260415077209473, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8797441720962524, + "num_tokens": 332803099.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7545101642608643, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.877493679523468, + "num_tokens": 332833892.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.53045654296875, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8631186485290527, + "num_tokens": 332876963.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6446552276611328, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8529286980628967, + "num_tokens": 332917007.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7331382036209106, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8625000715255737, + "num_tokens": 332952445.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7272300720214844, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8620105981826782, + "num_tokens": 332988758.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.726019024848938, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8496187925338745, + "num_tokens": 333025044.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7058390378952026, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.859218180179596, + "num_tokens": 333065836.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7902885675430298, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8528488278388977, + "num_tokens": 333099316.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6298497915267944, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8713366985321045, + "num_tokens": 333137506.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5427218675613403, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.872778058052063, + "num_tokens": 333179171.0, + "step": 8734 + }, + { + "epoch": 1.111181783488106, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6853998899459839, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8786791563034058, + "num_tokens": 333215116.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5223090648651123, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8626693487167358, + "num_tokens": 333260716.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6410719156265259, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8702842593193054, + "num_tokens": 333297923.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6861060857772827, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8752419948577881, + "num_tokens": 333338239.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7964801788330078, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8843066096305847, + "num_tokens": 333366885.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5731425285339355, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8681787848472595, + "num_tokens": 333408040.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.703925371170044, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8839030861854553, + "num_tokens": 333440950.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.520796537399292, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8808664083480835, + "num_tokens": 333482307.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.8981598615646362, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8498988151550293, + "num_tokens": 333516312.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.705780029296875, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8642458319664001, + "num_tokens": 333550783.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5260719060897827, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8731017112731934, + "num_tokens": 333593466.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.702730417251587, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.866492748260498, + "num_tokens": 333630663.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.693943738937378, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8757660388946533, + "num_tokens": 333661225.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5920127630233765, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8728312253952026, + "num_tokens": 333697654.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6634111404418945, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8597100973129272, + "num_tokens": 333739959.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6417062282562256, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8737293481826782, + "num_tokens": 333776603.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.655348300933838, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.866270124912262, + "num_tokens": 333815175.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.8804763555526733, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8724502921104431, + "num_tokens": 333855016.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5622109174728394, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.875898003578186, + "num_tokens": 333894051.0, + "step": 8753 + }, + { + "epoch": 1.1135987787813255, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7528586387634277, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8762562274932861, + "num_tokens": 333928164.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5608199834823608, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8638914823532104, + "num_tokens": 333969966.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6320871114730835, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8601990938186646, + "num_tokens": 334008949.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.579795002937317, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.872509241104126, + "num_tokens": 334052662.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6417665481567383, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8799141049385071, + "num_tokens": 334087107.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7036511898040771, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.875749945640564, + "num_tokens": 334120428.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6542311906814575, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8759974241256714, + "num_tokens": 334159818.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5747298002243042, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8641889095306396, + "num_tokens": 334200115.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6209925413131714, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8627632856369019, + "num_tokens": 334239193.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7990357875823975, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.851344645023346, + "num_tokens": 334277587.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6234592199325562, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8688583374023438, + "num_tokens": 334314247.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.698834776878357, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8645492792129517, + "num_tokens": 334349605.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7707881927490234, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8695979714393616, + "num_tokens": 334380462.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7273664474487305, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.858412504196167, + "num_tokens": 334416317.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.4670292139053345, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8716179132461548, + "num_tokens": 334459185.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6013586521148682, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8690446019172668, + "num_tokens": 334497616.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.602042317390442, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8627290725708008, + "num_tokens": 334534796.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6974446773529053, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8650081157684326, + "num_tokens": 334570204.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6413403749465942, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8515040278434753, + "num_tokens": 334611966.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5511012077331543, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8797446489334106, + "num_tokens": 334654483.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6984292268753052, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8786402940750122, + "num_tokens": 334687521.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.7433319091796875, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.865723192691803, + "num_tokens": 334723256.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.5444599390029907, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8779231309890747, + "num_tokens": 334762355.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 2.09808349609375e-05, + "grad_norm": 1.6973061561584473, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.868220865726471, + "num_tokens": 334799245.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 2.1688621044158936, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8730542659759521, + "num_tokens": 334830953.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.9408719539642334, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8676040768623352, + "num_tokens": 334863772.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6013998985290527, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.862625002861023, + "num_tokens": 334907194.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.785754919052124, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.859602689743042, + "num_tokens": 334943637.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6664843559265137, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8640847206115723, + "num_tokens": 334982213.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7360507249832153, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8792890310287476, + "num_tokens": 335019636.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5514442920684814, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8710389733314514, + "num_tokens": 335059904.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6366117000579834, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8812625408172607, + "num_tokens": 335095554.0, + "step": 8785 + }, + { + "epoch": 1.1176695076962218, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6530416011810303, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8675503730773926, + "num_tokens": 335138469.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6412800550460815, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8739117383956909, + "num_tokens": 335176790.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5398012399673462, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8827927112579346, + "num_tokens": 335216766.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6086598634719849, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.878558874130249, + "num_tokens": 335254631.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6035469770431519, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8742687702178955, + "num_tokens": 335295763.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7012656927108765, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8644639253616333, + "num_tokens": 335331327.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6320537328720093, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8802475929260254, + "num_tokens": 335366430.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.667523980140686, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.874355673789978, + "num_tokens": 335402633.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.56881582736969, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.864912748336792, + "num_tokens": 335443495.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8399626016616821, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8792847394943237, + "num_tokens": 335477558.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7801028490066528, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8667551279067993, + "num_tokens": 335511282.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.717974305152893, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8552682399749756, + "num_tokens": 335546617.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6260647773742676, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8728616237640381, + "num_tokens": 335587102.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5928354263305664, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8731001615524292, + "num_tokens": 335626680.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7985379695892334, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8566839694976807, + "num_tokens": 335661528.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.790531039237976, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8603723049163818, + "num_tokens": 335696834.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7166662216186523, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8653640747070312, + "num_tokens": 335735245.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.728475570678711, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8601553440093994, + "num_tokens": 335773355.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6070550680160522, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8654918074607849, + "num_tokens": 335811667.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.592186450958252, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8457969427108765, + "num_tokens": 335856233.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6059784889221191, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8766302466392517, + "num_tokens": 335891500.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5991876125335693, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.871852457523346, + "num_tokens": 335928204.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.555436611175537, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8576509952545166, + "num_tokens": 335972065.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6875115633010864, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8679439425468445, + "num_tokens": 336009299.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6157253980636597, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8775444030761719, + "num_tokens": 336047305.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6228734254837036, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8677257895469666, + "num_tokens": 336085754.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5131232738494873, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8726058006286621, + "num_tokens": 336127001.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.717594861984253, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8665508031845093, + "num_tokens": 336159851.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.63015615940094, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8592477440834045, + "num_tokens": 336202584.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5696260929107666, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8787273168563843, + "num_tokens": 336244233.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6678268909454346, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8745517730712891, + "num_tokens": 336279355.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6873849630355835, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8557318449020386, + "num_tokens": 336317525.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5998177528381348, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8812992572784424, + "num_tokens": 336353217.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6392109394073486, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8729733228683472, + "num_tokens": 336392708.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6115573644638062, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8665426969528198, + "num_tokens": 336431739.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5415618419647217, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.872410774230957, + "num_tokens": 336471003.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.560319423675537, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8543259501457214, + "num_tokens": 336512781.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4954811334609985, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8755303025245667, + "num_tokens": 336556132.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.566909670829773, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8598281145095825, + "num_tokens": 336597987.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5792545080184937, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8786817789077759, + "num_tokens": 336639389.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6177586317062378, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8688024282455444, + "num_tokens": 336673366.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5456188917160034, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8887733221054077, + "num_tokens": 336711614.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7536616325378418, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871878981590271, + "num_tokens": 336749536.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8160570859909058, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8595513105392456, + "num_tokens": 336780989.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6930713653564453, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8728768825531006, + "num_tokens": 336818437.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6302436590194702, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.875785231590271, + "num_tokens": 336861923.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6911635398864746, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.874973714351654, + "num_tokens": 336898766.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8438453674316406, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8825228214263916, + "num_tokens": 336929787.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7182786464691162, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8762387633323669, + "num_tokens": 336962677.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.645618200302124, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8831983804702759, + "num_tokens": 336994346.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.585334062576294, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8565087914466858, + "num_tokens": 337037311.0, + "step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6246941089630127, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.864945650100708, + "num_tokens": 337075964.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.663125991821289, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8770467638969421, + "num_tokens": 337111687.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6029932498931885, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8577086329460144, + "num_tokens": 337151034.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7020065784454346, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.848899781703949, + "num_tokens": 337185981.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5707650184631348, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.88495934009552, + "num_tokens": 337222496.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6367632150650024, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8712047338485718, + "num_tokens": 337261082.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6587821245193481, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8792163729667664, + "num_tokens": 337295623.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6727440357208252, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8706109523773193, + "num_tokens": 337332212.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6737953424453735, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.870949387550354, + "num_tokens": 337368539.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5208802223205566, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8602973818778992, + "num_tokens": 337415307.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5110081434249878, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8812703490257263, + "num_tokens": 337455390.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6548073291778564, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.874695897102356, + "num_tokens": 337492554.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7296701669692993, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.853010892868042, + "num_tokens": 337526425.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8078442811965942, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8681119084358215, + "num_tokens": 337562001.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6452511548995972, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8684113025665283, + "num_tokens": 337601613.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5295660495758057, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.875791072845459, + "num_tokens": 337641875.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.674872875213623, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8586483001708984, + "num_tokens": 337679406.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.735798954963684, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.857623279094696, + "num_tokens": 337714082.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.65714430809021, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8664262294769287, + "num_tokens": 337749446.0, + "step": 8855 + }, + { + "epoch": 1.1265742271975576, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.585594654083252, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.86407470703125, + "num_tokens": 337790841.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5881121158599854, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8696444630622864, + "num_tokens": 337828992.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8458470106124878, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8729438781738281, + "num_tokens": 337856651.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6909393072128296, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8729193210601807, + "num_tokens": 337890475.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5888175964355469, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.872978150844574, + "num_tokens": 337925911.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5685547590255737, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8745838403701782, + "num_tokens": 337964156.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5812572240829468, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8717542886734009, + "num_tokens": 338003066.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7042665481567383, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8528899550437927, + "num_tokens": 338040909.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.621307134628296, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8596615195274353, + "num_tokens": 338083145.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4932758808135986, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8857162594795227, + "num_tokens": 338127624.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5994853973388672, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8609130382537842, + "num_tokens": 338165562.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6362980604171753, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8695687651634216, + "num_tokens": 338205906.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5814200639724731, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8794705867767334, + "num_tokens": 338248020.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6007721424102783, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8553656339645386, + "num_tokens": 338291702.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.654433012008667, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8806213140487671, + "num_tokens": 338325951.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6479135751724243, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8628678917884827, + "num_tokens": 338366206.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5230048894882202, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8691576719284058, + "num_tokens": 338408022.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.64153254032135, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8740215301513672, + "num_tokens": 338443892.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.655561089515686, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8510799407958984, + "num_tokens": 338486445.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6059514284133911, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8663685321807861, + "num_tokens": 338521236.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6868256330490112, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8722934722900391, + "num_tokens": 338556976.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6197859048843384, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8696131706237793, + "num_tokens": 338596443.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5974785089492798, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8778945207595825, + "num_tokens": 338633748.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5744011402130127, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8773761987686157, + "num_tokens": 338672345.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5827299356460571, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8818786144256592, + "num_tokens": 338707279.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7234442234039307, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8746805191040039, + "num_tokens": 338739666.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7148525714874268, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8680121302604675, + "num_tokens": 338775714.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5711452960968018, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.856311559677124, + "num_tokens": 338819734.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.631162166595459, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8766561150550842, + "num_tokens": 338859068.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.636336326599121, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.864305853843689, + "num_tokens": 338899211.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7511143684387207, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8496739864349365, + "num_tokens": 338935364.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6676061153411865, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8566871881484985, + "num_tokens": 338969303.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6176167726516724, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8681067824363708, + "num_tokens": 339009480.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6803309917449951, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8602991104125977, + "num_tokens": 339046336.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6238118410110474, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8597339391708374, + "num_tokens": 339085302.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5534682273864746, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8748691082000732, + "num_tokens": 339123635.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.577203631401062, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8701227307319641, + "num_tokens": 339161874.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.668088436126709, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8819758892059326, + "num_tokens": 339193974.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.606218695640564, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8617269992828369, + "num_tokens": 339232472.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7274783849716187, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8701409101486206, + "num_tokens": 339273242.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5884021520614624, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8725631237030029, + "num_tokens": 339314108.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6351253986358643, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8617660999298096, + "num_tokens": 339354652.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.618931531906128, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8646515607833862, + "num_tokens": 339391811.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4452953338623047, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8687373995780945, + "num_tokens": 339438226.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4593162536621094, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8756073713302612, + "num_tokens": 339480547.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6919903755187988, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8549586534500122, + "num_tokens": 339519479.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7177811861038208, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8664130568504333, + "num_tokens": 339553336.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.522048830986023, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8808110356330872, + "num_tokens": 339593657.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5857709646224976, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8656182289123535, + "num_tokens": 339634201.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6323533058166504, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8693816661834717, + "num_tokens": 339672711.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5869970321655273, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8692243099212646, + "num_tokens": 339708283.0, + "step": 8906 + }, + { + "epoch": 1.1330619514056737, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5490474700927734, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8828235864639282, + "num_tokens": 339743541.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6989266872406006, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8630642294883728, + "num_tokens": 339781348.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7665313482284546, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8678028583526611, + "num_tokens": 339814250.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5449085235595703, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8717687129974365, + "num_tokens": 339857720.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5113329887390137, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8762010335922241, + "num_tokens": 339898664.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5816783905029297, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8828942179679871, + "num_tokens": 339937281.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4872157573699951, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8811835646629333, + "num_tokens": 339977100.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6683363914489746, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8585701584815979, + "num_tokens": 340014933.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8009443283081055, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8530467748641968, + "num_tokens": 340048954.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.60458505153656, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8614879250526428, + "num_tokens": 340089084.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5326377153396606, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8505253791809082, + "num_tokens": 340134177.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7676457166671753, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8649371862411499, + "num_tokens": 340170856.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5155407190322876, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8674734830856323, + "num_tokens": 340212593.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5977107286453247, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8730602860450745, + "num_tokens": 340249791.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.648427128791809, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.855415940284729, + "num_tokens": 340288874.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.4132137298583984, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8936555981636047, + "num_tokens": 340330384.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.618816614151001, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8659025430679321, + "num_tokens": 340368586.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6618397235870361, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8689314126968384, + "num_tokens": 340403033.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.638911247253418, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8846550583839417, + "num_tokens": 340437611.0, + "step": 8925 + }, + { + "epoch": 1.1354789466988933, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5140935182571411, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8762726783752441, + "num_tokens": 340479917.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.440934419631958, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8792343139648438, + "num_tokens": 340521803.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6052402257919312, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8791479468345642, + "num_tokens": 340557246.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6723002195358276, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8633394241333008, + "num_tokens": 340594139.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5180624723434448, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8737243413925171, + "num_tokens": 340639372.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.689635992050171, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8782503604888916, + "num_tokens": 340676990.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5651276111602783, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8633583784103394, + "num_tokens": 340721864.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5918487310409546, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8723671436309814, + "num_tokens": 340759841.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6648904085159302, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8761235475540161, + "num_tokens": 340796909.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5750021934509277, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.88419508934021, + "num_tokens": 340837816.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6620861291885376, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8656255006790161, + "num_tokens": 340876338.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6744228601455688, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8648894429206848, + "num_tokens": 340913151.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5857113599777222, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8757773041725159, + "num_tokens": 340950829.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.453521728515625, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8706164956092834, + "num_tokens": 340996026.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5880216360092163, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8701841831207275, + "num_tokens": 341034746.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6836907863616943, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8711919784545898, + "num_tokens": 341071138.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.729974627494812, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8614881634712219, + "num_tokens": 341106699.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5850056409835815, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8789612054824829, + "num_tokens": 341144093.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6988451480865479, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8613137602806091, + "num_tokens": 341182240.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7035428285598755, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8625664710998535, + "num_tokens": 341217065.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6074708700180054, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8736004829406738, + "num_tokens": 341257284.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5283820629119873, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8744051456451416, + "num_tokens": 341294339.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5853168964385986, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8492857217788696, + "num_tokens": 341340484.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4785692691802979, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8807653784751892, + "num_tokens": 341383396.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6688414812088013, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8822853565216064, + "num_tokens": 341417517.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6163456439971924, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8651716709136963, + "num_tokens": 341456855.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4853647947311401, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8740811347961426, + "num_tokens": 341497720.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7037944793701172, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8484448194503784, + "num_tokens": 341537712.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7542212009429932, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8689612150192261, + "num_tokens": 341571600.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4757812023162842, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8707855939865112, + "num_tokens": 341617100.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.557213544845581, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.880497932434082, + "num_tokens": 341653247.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7200515270233154, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8641514778137207, + "num_tokens": 341687734.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.9200762510299683, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8735809326171875, + "num_tokens": 341718920.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4884719848632812, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8599121570587158, + "num_tokens": 341763571.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7458018064498901, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8646344542503357, + "num_tokens": 341797074.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6582967042922974, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8698475360870361, + "num_tokens": 341833819.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.779268503189087, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8718434572219849, + "num_tokens": 341864432.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6012424230575562, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.868250846862793, + "num_tokens": 341905334.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7033601999282837, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8583866357803345, + "num_tokens": 341947280.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7342464923858643, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8759626150131226, + "num_tokens": 341982794.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6342085599899292, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8676726222038269, + "num_tokens": 342023909.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7654666900634766, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8573415279388428, + "num_tokens": 342060984.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5699044466018677, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8720932006835938, + "num_tokens": 342100186.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6035397052764893, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8564870357513428, + "num_tokens": 342141844.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6519697904586792, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.864625096321106, + "num_tokens": 342180811.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7206534147262573, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8809581995010376, + "num_tokens": 342213342.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6643097400665283, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8925356864929199, + "num_tokens": 342249655.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7396337985992432, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8620524406433105, + "num_tokens": 342283179.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6191586256027222, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.872499406337738, + "num_tokens": 342322675.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5920368432998657, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8676836490631104, + "num_tokens": 342363405.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.666943073272705, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8729802966117859, + "num_tokens": 342406139.0, + "step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6400705575942993, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8798681497573853, + "num_tokens": 342444696.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6166549921035767, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8713915944099426, + "num_tokens": 342479128.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.723902940750122, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.862694263458252, + "num_tokens": 342512374.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7473690509796143, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8774077296257019, + "num_tokens": 342546319.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6545850038528442, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8784826993942261, + "num_tokens": 342585755.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5780624151229858, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8806372284889221, + "num_tokens": 342626113.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6057398319244385, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8633666634559631, + "num_tokens": 342664543.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.535672664642334, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8689095973968506, + "num_tokens": 342704969.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.797722339630127, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8710553646087646, + "num_tokens": 342739486.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5641274452209473, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8627465963363647, + "num_tokens": 342781310.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7313997745513916, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8714906573295593, + "num_tokens": 342815873.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6876630783081055, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8843159675598145, + "num_tokens": 342851021.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6881108283996582, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8599106073379517, + "num_tokens": 342890374.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6425052881240845, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8644030094146729, + "num_tokens": 342927845.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7479681968688965, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8501372337341309, + "num_tokens": 342967852.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5787887573242188, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.877220630645752, + "num_tokens": 343007481.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6617469787597656, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8836466073989868, + "num_tokens": 343042000.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5495705604553223, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8783196806907654, + "num_tokens": 343084934.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.4427608251571655, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.874495267868042, + "num_tokens": 343127332.0, + "step": 8995 + }, + { + "epoch": 1.144383666200229, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5093632936477661, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8730483651161194, + "num_tokens": 343171836.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7336137294769287, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8567520380020142, + "num_tokens": 343210376.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5176093578338623, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8825646638870239, + "num_tokens": 343251637.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6028932332992554, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.875504732131958, + "num_tokens": 343290632.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4375789165496826, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8880146145820618, + "num_tokens": 343331239.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4735766649246216, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8873468637466431, + "num_tokens": 343368845.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.598875880241394, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8783356547355652, + "num_tokens": 343405806.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.626490831375122, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8668542504310608, + "num_tokens": 343444224.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6236655712127686, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8638501167297363, + "num_tokens": 343486614.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5237674713134766, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8767992854118347, + "num_tokens": 343526580.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6252413988113403, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8542060256004333, + "num_tokens": 343566560.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.692038655281067, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8751320838928223, + "num_tokens": 343601978.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8884599208831787, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8590342402458191, + "num_tokens": 343641845.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7517688274383545, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8693774938583374, + "num_tokens": 343677755.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6761101484298706, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8593192100524902, + "num_tokens": 343715221.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.475943684577942, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8547512888908386, + "num_tokens": 343760875.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5963709354400635, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8750631213188171, + "num_tokens": 343800828.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.774416208267212, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8560994863510132, + "num_tokens": 343835824.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7057628631591797, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8472916483879089, + "num_tokens": 343879484.0, + "step": 9014 + }, + { + "epoch": 1.1468006614934487, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.500034213066101, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8613969087600708, + "num_tokens": 343923811.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7629112005233765, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.843917965888977, + "num_tokens": 343961618.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.659551978111267, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8770260810852051, + "num_tokens": 344002871.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4788148403167725, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8732123970985413, + "num_tokens": 344044895.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6262327432632446, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.869657576084137, + "num_tokens": 344079129.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5556379556655884, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8549039363861084, + "num_tokens": 344124361.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6019831895828247, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8868138790130615, + "num_tokens": 344158061.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5450316667556763, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8778603076934814, + "num_tokens": 344197170.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6049184799194336, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8602099418640137, + "num_tokens": 344237565.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4858392477035522, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.879010796546936, + "num_tokens": 344282594.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6574281454086304, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8719862699508667, + "num_tokens": 344318246.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.4406805038452148, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8892809152603149, + "num_tokens": 344359874.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5220600366592407, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8842411041259766, + "num_tokens": 344398595.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6725339889526367, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8644262552261353, + "num_tokens": 344440375.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5846495628356934, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8762847781181335, + "num_tokens": 344478134.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7417404651641846, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.855186939239502, + "num_tokens": 344512340.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.737148404121399, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8701104521751404, + "num_tokens": 344548486.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.559343934059143, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8667055368423462, + "num_tokens": 344588139.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7816420793533325, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8614460229873657, + "num_tokens": 344621510.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7024644613265991, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8651179075241089, + "num_tokens": 344657886.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.7265896797180176, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8694949746131897, + "num_tokens": 344695931.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6353321075439453, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8757055997848511, + "num_tokens": 344733742.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5594654083251953, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8543077111244202, + "num_tokens": 344777457.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5757560729980469, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8716086149215698, + "num_tokens": 344814628.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.630296230316162, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8665592670440674, + "num_tokens": 344857946.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6563938856124878, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8635765314102173, + "num_tokens": 344894565.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.607437252998352, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.877463698387146, + "num_tokens": 344935817.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6295446157455444, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8682990074157715, + "num_tokens": 344974923.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6445348262786865, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8579180240631104, + "num_tokens": 345013574.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.4926735162734985, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.864981472492218, + "num_tokens": 345056815.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.63303804397583, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8598539233207703, + "num_tokens": 345095500.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.8507484197616577, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8714466691017151, + "num_tokens": 345131663.0, + "step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6899185180664062, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8687599897384644, + "num_tokens": 345167157.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6678770780563354, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8722608089447021, + "num_tokens": 345203630.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.600866436958313, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8526334166526794, + "num_tokens": 345248245.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.649330496788025, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8650346994400024, + "num_tokens": 345286322.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6642714738845825, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8662296533584595, + "num_tokens": 345323609.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6275672912597656, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8763154149055481, + "num_tokens": 345358973.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5432136058807373, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8638549447059631, + "num_tokens": 345398676.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5924052000045776, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8834601640701294, + "num_tokens": 345436066.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6282752752304077, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.869088888168335, + "num_tokens": 345472115.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.513357162475586, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.882123589515686, + "num_tokens": 345512861.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.6491779088974, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8524723649024963, + "num_tokens": 345551000.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 2.110004425048828e-05, + "grad_norm": 1.5584092140197754, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8622147440910339, + "num_tokens": 345591567.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6094324588775635, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8605157136917114, + "num_tokens": 345634190.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6386617422103882, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8547571897506714, + "num_tokens": 345675253.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5431212186813354, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8645950555801392, + "num_tokens": 345716664.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.702958106994629, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8595348000526428, + "num_tokens": 345752929.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5988959074020386, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8676954507827759, + "num_tokens": 345791001.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6905544996261597, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8763557076454163, + "num_tokens": 345824498.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6664361953735352, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.865953803062439, + "num_tokens": 345864282.0, + "step": 9065 + }, + { + "epoch": 1.1532883857015648, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6744561195373535, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8738420605659485, + "num_tokens": 345897625.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.514093279838562, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8852646350860596, + "num_tokens": 345936238.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7578507661819458, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8652668595314026, + "num_tokens": 345970638.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6758756637573242, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8629451990127563, + "num_tokens": 346010709.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.453303575515747, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8763526082038879, + "num_tokens": 346055482.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6547342538833618, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8665467500686646, + "num_tokens": 346095623.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.562135934829712, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8604106903076172, + "num_tokens": 346135748.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.62652587890625, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8690091371536255, + "num_tokens": 346170428.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6496044397354126, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8598114252090454, + "num_tokens": 346209382.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6497529745101929, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8537893295288086, + "num_tokens": 346250459.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6160187721252441, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8649268746376038, + "num_tokens": 346294028.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5528672933578491, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8708995580673218, + "num_tokens": 346331905.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.552476406097412, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8835118412971497, + "num_tokens": 346370064.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5258251428604126, + "learning_rate": 1e-06, + "loss": 0.3191, + "mean_token_accuracy": 0.8915500044822693, + "num_tokens": 346404629.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7560486793518066, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8616887927055359, + "num_tokens": 346439552.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6977275609970093, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8507025241851807, + "num_tokens": 346476456.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6344506740570068, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8560206294059753, + "num_tokens": 346517672.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7020288705825806, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8614920377731323, + "num_tokens": 346556586.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.568633794784546, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8763009905815125, + "num_tokens": 346598356.0, + "step": 9084 + }, + { + "epoch": 1.1557053809947844, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5885525941848755, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8779042959213257, + "num_tokens": 346635402.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5807114839553833, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8762543201446533, + "num_tokens": 346673896.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.809706687927246, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8740522265434265, + "num_tokens": 346706966.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6159418821334839, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8677687644958496, + "num_tokens": 346749228.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7698466777801514, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8562002778053284, + "num_tokens": 346788499.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5606071949005127, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8688534498214722, + "num_tokens": 346827546.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5936230421066284, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8709361553192139, + "num_tokens": 346865932.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6312998533248901, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8681639432907104, + "num_tokens": 346899910.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7159414291381836, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8619818687438965, + "num_tokens": 346935176.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6797435283660889, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8766540288925171, + "num_tokens": 346970327.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.588098168373108, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8802838325500488, + "num_tokens": 347006727.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5230481624603271, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8735634684562683, + "num_tokens": 347049894.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6207386255264282, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8696489930152893, + "num_tokens": 347093028.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.644913911819458, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8788442015647888, + "num_tokens": 347133850.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.7882064580917358, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8727911710739136, + "num_tokens": 347166684.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5903851985931396, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8616303205490112, + "num_tokens": 347206909.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5839241743087769, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8698689341545105, + "num_tokens": 347247514.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.8070573806762695, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.852825403213501, + "num_tokens": 347281418.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5821589231491089, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8548817038536072, + "num_tokens": 347322188.0, + "step": 9103 + }, + { + "epoch": 1.158122376288004, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.643871784210205, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8820271492004395, + "num_tokens": 347355845.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6369061470031738, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8503864407539368, + "num_tokens": 347394666.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6260038614273071, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8644426465034485, + "num_tokens": 347432125.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.565004587173462, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8568867444992065, + "num_tokens": 347474716.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5256915092468262, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8717042803764343, + "num_tokens": 347516709.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.588498592376709, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8703446388244629, + "num_tokens": 347554934.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.6267602443695068, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8627653121948242, + "num_tokens": 347590546.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5610383749008179, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8711853623390198, + "num_tokens": 347630569.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5988622903823853, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8656628727912903, + "num_tokens": 347670627.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5922478437423706, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8863000869750977, + "num_tokens": 347704956.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.546956181526184, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8727409839630127, + "num_tokens": 347745870.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5365958213806152, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8744269609451294, + "num_tokens": 347783571.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6035637855529785, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8562328815460205, + "num_tokens": 347829294.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6976044178009033, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8601338863372803, + "num_tokens": 347869631.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.4565190076828003, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8851178884506226, + "num_tokens": 347915919.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.536137342453003, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8549516797065735, + "num_tokens": 347960341.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 2.1219253540039062e-05, + "grad_norm": 1.5941952466964722, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8596669435501099, + "num_tokens": 347998747.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.602527141571045, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8588824272155762, + "num_tokens": 348037084.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5270953178405762, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8756681680679321, + "num_tokens": 348077145.0, + "step": 9122 + }, + { + "epoch": 1.1605393715812238, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.621286392211914, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8608046770095825, + "num_tokens": 348116568.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6162408590316772, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8558601140975952, + "num_tokens": 348154796.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6574612855911255, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8617286682128906, + "num_tokens": 348195823.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.608763337135315, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8833850622177124, + "num_tokens": 348231112.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5679219961166382, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8778621554374695, + "num_tokens": 348272711.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6295667886734009, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8676643371582031, + "num_tokens": 348308048.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6650108098983765, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8649324178695679, + "num_tokens": 348344315.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5299336910247803, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8646950721740723, + "num_tokens": 348388693.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5828495025634766, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8533704280853271, + "num_tokens": 348432943.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7276097536087036, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8815305233001709, + "num_tokens": 348468264.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6388298273086548, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8810967803001404, + "num_tokens": 348503534.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.612436294555664, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8834505677223206, + "num_tokens": 348537555.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7003986835479736, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8562397956848145, + "num_tokens": 348573579.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6375391483306885, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8852135539054871, + "num_tokens": 348606353.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.621708869934082, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8730817437171936, + "num_tokens": 348647170.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6383377313613892, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8824245929718018, + "num_tokens": 348685669.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4317303895950317, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8852221369743347, + "num_tokens": 348727998.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.524753212928772, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8786324262619019, + "num_tokens": 348768113.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.582878589630127, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8724391460418701, + "num_tokens": 348808204.0, + "step": 9141 + }, + { + "epoch": 1.1629563668744434, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6252151727676392, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8747726678848267, + "num_tokens": 348844379.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7729406356811523, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8734908103942871, + "num_tokens": 348879003.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.598325490951538, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8689553737640381, + "num_tokens": 348917696.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6408932209014893, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8744253516197205, + "num_tokens": 348954706.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4804015159606934, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8653407096862793, + "num_tokens": 348997335.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6198275089263916, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8546165227890015, + "num_tokens": 349035972.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5698728561401367, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8682960867881775, + "num_tokens": 349078724.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4999945163726807, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8656253218650818, + "num_tokens": 349121838.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.598740816116333, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8689371943473816, + "num_tokens": 349158457.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6118067502975464, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.882016122341156, + "num_tokens": 349192124.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6341633796691895, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8790780305862427, + "num_tokens": 349229129.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.623814344406128, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8657338619232178, + "num_tokens": 349269980.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6845234632492065, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8766871690750122, + "num_tokens": 349306266.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7653682231903076, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8726637959480286, + "num_tokens": 349337877.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5591578483581543, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.8899712562561035, + "num_tokens": 349373062.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5623499155044556, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8834363222122192, + "num_tokens": 349410589.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7533764839172363, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8569958806037903, + "num_tokens": 349447169.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6728546619415283, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8714051246643066, + "num_tokens": 349478281.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.8364956378936768, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8526066541671753, + "num_tokens": 349514293.0, + "step": 9160 + }, + { + "epoch": 1.1653733621676632, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6687954664230347, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8825756311416626, + "num_tokens": 349546654.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6955708265304565, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.873805046081543, + "num_tokens": 349580409.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6201303005218506, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8675345182418823, + "num_tokens": 349618292.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7722418308258057, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8577895760536194, + "num_tokens": 349655250.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6571242809295654, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8671272993087769, + "num_tokens": 349694994.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7256557941436768, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8912721872329712, + "num_tokens": 349733551.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.601922869682312, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8818449974060059, + "num_tokens": 349768422.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7343146800994873, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8478654623031616, + "num_tokens": 349805423.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5235763788223267, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8643649220466614, + "num_tokens": 349848590.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.609208583831787, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8785127401351929, + "num_tokens": 349889191.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4666181802749634, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8811799883842468, + "num_tokens": 349929850.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7959343194961548, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8772838115692139, + "num_tokens": 349961656.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6477937698364258, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8662392497062683, + "num_tokens": 350001589.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7765947580337524, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8564960956573486, + "num_tokens": 350034271.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.631235957145691, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8773672580718994, + "num_tokens": 350073418.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5142847299575806, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8739818334579468, + "num_tokens": 350114147.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5282686948776245, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8825501203536987, + "num_tokens": 350152507.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6563066244125366, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.878449559211731, + "num_tokens": 350186591.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.754326343536377, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8572465181350708, + "num_tokens": 350222959.0, + "step": 9179 + }, + { + "epoch": 1.1677903574608828, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5322377681732178, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8834438920021057, + "num_tokens": 350263968.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6518762111663818, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8504327535629272, + "num_tokens": 350303301.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7886905670166016, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8658965229988098, + "num_tokens": 350339157.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.644585132598877, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8657937049865723, + "num_tokens": 350376033.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7287472486495972, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8548145294189453, + "num_tokens": 350410851.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6644978523254395, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8681375980377197, + "num_tokens": 350449235.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6712161302566528, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8639273643493652, + "num_tokens": 350486764.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.594761610031128, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8752481937408447, + "num_tokens": 350521152.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.586990237236023, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8647264242172241, + "num_tokens": 350561507.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7127447128295898, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8644198179244995, + "num_tokens": 350600881.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7803139686584473, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8563370704650879, + "num_tokens": 350636674.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6430926322937012, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8628343343734741, + "num_tokens": 350677875.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7904365062713623, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8765588402748108, + "num_tokens": 350712915.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6754343509674072, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8510198593139648, + "num_tokens": 350751099.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6570055484771729, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8711026310920715, + "num_tokens": 350785668.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6282985210418701, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.886827826499939, + "num_tokens": 350822189.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5302971601486206, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8729351758956909, + "num_tokens": 350861803.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5846275091171265, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8719358444213867, + "num_tokens": 350904583.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6592488288879395, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.857202410697937, + "num_tokens": 350944411.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.594618558883667, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8776193857192993, + "num_tokens": 350982469.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6018338203430176, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8805680274963379, + "num_tokens": 351021733.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4971421957015991, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8692522644996643, + "num_tokens": 351067909.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5332615375518799, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8835437297821045, + "num_tokens": 351106454.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.525613784790039, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8837087154388428, + "num_tokens": 351145238.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5890955924987793, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8692591190338135, + "num_tokens": 351183718.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7620954513549805, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8565541505813599, + "num_tokens": 351218188.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.2023746967315674, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8631479740142822, + "num_tokens": 351262627.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6187440156936646, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8770347833633423, + "num_tokens": 351305786.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.8280118703842163, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8634097576141357, + "num_tokens": 351339846.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 4.663363456726074, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8731379508972168, + "num_tokens": 351377181.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.53615140914917, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.86031174659729, + "num_tokens": 351423235.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.70140540599823, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8727323412895203, + "num_tokens": 351460740.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6843074560165405, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8780864477157593, + "num_tokens": 351494259.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7852691411972046, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8744876384735107, + "num_tokens": 351523142.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6132440567016602, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8568811416625977, + "num_tokens": 351561707.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6455734968185425, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.863072395324707, + "num_tokens": 351601924.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6305755376815796, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8648413419723511, + "num_tokens": 351639456.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6006121635437012, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.875792920589447, + "num_tokens": 351679283.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5966746807098389, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8651636838912964, + "num_tokens": 351719005.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5798029899597168, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8699491024017334, + "num_tokens": 351755712.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5680965185165405, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8671009540557861, + "num_tokens": 351796754.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6772732734680176, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8785157203674316, + "num_tokens": 351831781.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4364911317825317, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8853175044059753, + "num_tokens": 351875709.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.8653044700622559, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8826299905776978, + "num_tokens": 351911494.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6808401346206665, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8584976196289062, + "num_tokens": 351946645.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 2.341261386871338, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8656194806098938, + "num_tokens": 351982154.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.800200343132019, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8733506202697754, + "num_tokens": 352013743.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6139403581619263, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8661818504333496, + "num_tokens": 352058606.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7545162439346313, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8769677877426147, + "num_tokens": 352093044.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.603136658668518, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8623074889183044, + "num_tokens": 352131103.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6308033466339111, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8660820126533508, + "num_tokens": 352165974.0, + "step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7293099164962769, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8566498756408691, + "num_tokens": 352203631.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5926159620285034, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8716902136802673, + "num_tokens": 352243563.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7895890474319458, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8620719909667969, + "num_tokens": 352276892.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6923961639404297, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8768033981323242, + "num_tokens": 352315235.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6916966438293457, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8717942237854004, + "num_tokens": 352348534.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5655651092529297, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8657110929489136, + "num_tokens": 352389400.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6719547510147095, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8620262742042542, + "num_tokens": 352426957.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5894746780395508, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8847308158874512, + "num_tokens": 352465264.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5447889566421509, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8669561147689819, + "num_tokens": 352513548.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.745439052581787, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8564964532852173, + "num_tokens": 352549553.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5609519481658936, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.866290807723999, + "num_tokens": 352589461.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5048167705535889, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8811861276626587, + "num_tokens": 352630861.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5863308906555176, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8819943070411682, + "num_tokens": 352666525.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.651415228843689, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8767309188842773, + "num_tokens": 352701423.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6203117370605469, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8671218156814575, + "num_tokens": 352739927.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7240755558013916, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8410329818725586, + "num_tokens": 352779406.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7689582109451294, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8602302670478821, + "num_tokens": 352819229.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5107864141464233, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8812505006790161, + "num_tokens": 352857057.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7422596216201782, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8630131483078003, + "num_tokens": 352898301.0, + "step": 9249 + }, + { + "epoch": 1.1766950769622184, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.662355661392212, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8615322113037109, + "num_tokens": 352935699.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7481467723846436, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8802615404129028, + "num_tokens": 352969486.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 4.657122611999512, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8648327589035034, + "num_tokens": 353004211.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.532157063484192, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8816462159156799, + "num_tokens": 353045325.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.8846766948699951, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8692427277565002, + "num_tokens": 353085843.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7392187118530273, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8632333278656006, + "num_tokens": 353123562.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.8180981874465942, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8660255074501038, + "num_tokens": 353158328.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4133676290512085, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8840286135673523, + "num_tokens": 353201328.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6938635110855103, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8812923431396484, + "num_tokens": 353234148.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5853822231292725, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8780092000961304, + "num_tokens": 353273104.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6488776206970215, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.863120436668396, + "num_tokens": 353314759.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5695303678512573, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8710904717445374, + "num_tokens": 353358682.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6528255939483643, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8630114793777466, + "num_tokens": 353394871.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4790639877319336, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8827837109565735, + "num_tokens": 353434769.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7116209268569946, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8572378754615784, + "num_tokens": 353473034.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4715721607208252, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8905751705169678, + "num_tokens": 353517900.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6507309675216675, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8792906999588013, + "num_tokens": 353551836.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6184911727905273, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.869981050491333, + "num_tokens": 353588441.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5688904523849487, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8762398958206177, + "num_tokens": 353627245.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4765946865081787, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8819616436958313, + "num_tokens": 353669585.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5064386129379272, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8720375299453735, + "num_tokens": 353712539.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5567039251327515, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8796974420547485, + "num_tokens": 353753552.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6283985376358032, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8615121841430664, + "num_tokens": 353792672.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.661755919456482, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8785591125488281, + "num_tokens": 353831757.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6271274089813232, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8852400779724121, + "num_tokens": 353870022.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5715248584747314, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8782002329826355, + "num_tokens": 353906854.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7722476720809937, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8752210736274719, + "num_tokens": 353937155.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7134555578231812, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8608978986740112, + "num_tokens": 353974457.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6853443384170532, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.88200443983078, + "num_tokens": 354008499.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6764168739318848, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8681226968765259, + "num_tokens": 354045922.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6592718362808228, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8570103645324707, + "num_tokens": 354082532.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5783214569091797, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8679372668266296, + "num_tokens": 354124410.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6048897504806519, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8745105266571045, + "num_tokens": 354161346.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5553698539733887, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8743130564689636, + "num_tokens": 354201914.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.643349051475525, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8658803105354309, + "num_tokens": 354237939.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5984224081039429, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8731642961502075, + "num_tokens": 354277031.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.677964210510254, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8634520173072815, + "num_tokens": 354312922.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6051740646362305, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8592186570167542, + "num_tokens": 354353702.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.8225599527359009, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8634067177772522, + "num_tokens": 354390075.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6170929670333862, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.872543454170227, + "num_tokens": 354430404.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5359116792678833, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8799275159835815, + "num_tokens": 354467826.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4678081274032593, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.881743848323822, + "num_tokens": 354512610.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.627218246459961, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8837364315986633, + "num_tokens": 354548894.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6688883304595947, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8576679229736328, + "num_tokens": 354587672.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6941167116165161, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8566460609436035, + "num_tokens": 354626632.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.554962158203125, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8765085935592651, + "num_tokens": 354666010.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6134909391403198, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.877568244934082, + "num_tokens": 354705802.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5903865098953247, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8635941743850708, + "num_tokens": 354748833.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6964540481567383, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8597802519798279, + "num_tokens": 354784918.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.7630364894866943, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8684727549552917, + "num_tokens": 354816700.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.6037770509719849, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8475984334945679, + "num_tokens": 354859527.0, + "step": 9300 + }, + { + "epoch": 1.1831828011703345, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.645247220993042, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8702479600906372, + "num_tokens": 354897019.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.5197434425354004, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8790115118026733, + "num_tokens": 354937141.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 2.1338462829589844e-05, + "grad_norm": 1.4969133138656616, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8839384317398071, + "num_tokens": 354976114.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4918951988220215, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.875726580619812, + "num_tokens": 355016198.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4939838647842407, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8769031763076782, + "num_tokens": 355060596.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.848693609237671, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8615584969520569, + "num_tokens": 355091564.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.714927315711975, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8565146327018738, + "num_tokens": 355129066.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7605841159820557, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8711656332015991, + "num_tokens": 355161202.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6296191215515137, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8675287961959839, + "num_tokens": 355197124.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.634077548980713, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8784822821617126, + "num_tokens": 355234308.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.756269097328186, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8676570653915405, + "num_tokens": 355271024.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6278880834579468, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8785476088523865, + "num_tokens": 355307463.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4773939847946167, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8839235305786133, + "num_tokens": 355349246.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.434272050857544, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8771371245384216, + "num_tokens": 355395868.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.62526535987854, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8644682765007019, + "num_tokens": 355435056.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5729198455810547, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8868880271911621, + "num_tokens": 355471821.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6363422870635986, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8648712635040283, + "num_tokens": 355507176.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5352325439453125, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.870951771736145, + "num_tokens": 355549828.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6005785465240479, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8715089559555054, + "num_tokens": 355586833.0, + "step": 9319 + }, + { + "epoch": 1.1855997964635543, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5413296222686768, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8646140098571777, + "num_tokens": 355629251.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.631665587425232, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8629149198532104, + "num_tokens": 355669658.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.8417340517044067, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8537670373916626, + "num_tokens": 355703666.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5289446115493774, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8832546472549438, + "num_tokens": 355741983.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.8604575395584106, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.874228298664093, + "num_tokens": 355771029.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.625796914100647, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8699040412902832, + "num_tokens": 355809831.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6364742517471313, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8521332740783691, + "num_tokens": 355852537.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.485765814781189, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8609933853149414, + "num_tokens": 355895171.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4684126377105713, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8646869659423828, + "num_tokens": 355938418.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6575145721435547, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8788406252861023, + "num_tokens": 355973727.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6027638912200928, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8687081336975098, + "num_tokens": 356010860.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5298407077789307, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8751300573348999, + "num_tokens": 356048013.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.498854398727417, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8744333982467651, + "num_tokens": 356090268.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6469123363494873, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8816835880279541, + "num_tokens": 356125470.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5805845260620117, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8675792217254639, + "num_tokens": 356168119.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.591464638710022, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8742771744728088, + "num_tokens": 356208382.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5461605787277222, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8707898855209351, + "num_tokens": 356251378.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5759170055389404, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8632079362869263, + "num_tokens": 356294325.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6957374811172485, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8618898391723633, + "num_tokens": 356331503.0, + "step": 9338 + }, + { + "epoch": 1.1880167917567739, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5400725603103638, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8684930801391602, + "num_tokens": 356375654.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6784237623214722, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8796223402023315, + "num_tokens": 356409714.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.582342505455017, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8798707723617554, + "num_tokens": 356454576.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6268151998519897, + "learning_rate": 1e-06, + "loss": 0.3222, + "mean_token_accuracy": 0.8952788710594177, + "num_tokens": 356490637.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6223464012145996, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8666576743125916, + "num_tokens": 356527772.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6447685956954956, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8608834743499756, + "num_tokens": 356563791.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7139614820480347, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8618943095207214, + "num_tokens": 356600972.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6740609407424927, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8856214284896851, + "num_tokens": 356638855.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.58695387840271, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8737984895706177, + "num_tokens": 356675231.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6923998594284058, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8762145042419434, + "num_tokens": 356709573.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7185262441635132, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.865910530090332, + "num_tokens": 356741366.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.538527011871338, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8766976594924927, + "num_tokens": 356783665.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7022701501846313, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8829229474067688, + "num_tokens": 356815930.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.549893856048584, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8611463308334351, + "num_tokens": 356858530.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6161468029022217, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8666081428527832, + "num_tokens": 356899247.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.578324556350708, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8689742088317871, + "num_tokens": 356939089.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.767261266708374, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8785057067871094, + "num_tokens": 356971665.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.581236481666565, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.87934410572052, + "num_tokens": 357010625.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6115034818649292, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.877351701259613, + "num_tokens": 357049437.0, + "step": 9357 + }, + { + "epoch": 1.1904337870499937, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.581189513206482, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8820807337760925, + "num_tokens": 357084595.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6558996438980103, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8657951951026917, + "num_tokens": 357121435.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6852167844772339, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8811692595481873, + "num_tokens": 357156073.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4885263442993164, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8768135905265808, + "num_tokens": 357198376.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5933722257614136, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.872484564781189, + "num_tokens": 357235716.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6088587045669556, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8726087212562561, + "num_tokens": 357272482.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.663034439086914, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8605318069458008, + "num_tokens": 357307159.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5247881412506104, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8700171709060669, + "num_tokens": 357348636.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.749659776687622, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8611890077590942, + "num_tokens": 357384488.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.580164909362793, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8850655555725098, + "num_tokens": 357420760.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4916573762893677, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8597792387008667, + "num_tokens": 357466579.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.639311671257019, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8502867817878723, + "num_tokens": 357507086.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6735376119613647, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8496818542480469, + "num_tokens": 357546159.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6790636777877808, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.869986355304718, + "num_tokens": 357582859.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5707550048828125, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8845928907394409, + "num_tokens": 357624083.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.551424264907837, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8771367073059082, + "num_tokens": 357663971.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5904521942138672, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8730963468551636, + "num_tokens": 357700809.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7390040159225464, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.874382495880127, + "num_tokens": 357733424.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.682497262954712, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8675453066825867, + "num_tokens": 357767030.0, + "step": 9376 + }, + { + "epoch": 1.1928507823432133, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6428495645523071, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8649786710739136, + "num_tokens": 357807284.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5078270435333252, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8718183040618896, + "num_tokens": 357850495.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6208559274673462, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8614855408668518, + "num_tokens": 357889317.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7326310873031616, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8676087856292725, + "num_tokens": 357922779.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.4892388582229614, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8688743710517883, + "num_tokens": 357965083.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5617998838424683, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8918452262878418, + "num_tokens": 358002722.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.656205415725708, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8689255714416504, + "num_tokens": 358039581.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7282379865646362, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8610439896583557, + "num_tokens": 358073216.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5056244134902954, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.873599648475647, + "num_tokens": 358112777.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.774399995803833, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8600914478302002, + "num_tokens": 358146730.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6912307739257812, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8775554299354553, + "num_tokens": 358184973.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.7636741399765015, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8726996183395386, + "num_tokens": 358216904.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.552282691001892, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.863814115524292, + "num_tokens": 358257060.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6286004781723022, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.880259096622467, + "num_tokens": 358295461.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6492946147918701, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8718528747558594, + "num_tokens": 358335484.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.613221526145935, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8744885921478271, + "num_tokens": 358373871.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5561140775680542, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8608977794647217, + "num_tokens": 358415529.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.595916748046875, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8732219934463501, + "num_tokens": 358454446.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5881145000457764, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8644035458564758, + "num_tokens": 358494283.0, + "step": 9395 + }, + { + "epoch": 1.195267777636433, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.6178803443908691, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8817445039749146, + "num_tokens": 358529556.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6111173629760742, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8752102851867676, + "num_tokens": 358566575.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7180571556091309, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8736613988876343, + "num_tokens": 358600733.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 2.1457672119140625e-05, + "grad_norm": 1.5610263347625732, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8695429563522339, + "num_tokens": 358643004.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5202000141143799, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8655827045440674, + "num_tokens": 358685832.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5353496074676514, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8671293258666992, + "num_tokens": 358729686.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5788663625717163, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8774322867393494, + "num_tokens": 358767562.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6428320407867432, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8705689311027527, + "num_tokens": 358799961.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.711928367614746, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8668877482414246, + "num_tokens": 358839501.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5866037607192993, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8658154010772705, + "num_tokens": 358879671.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6803228855133057, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8803674578666687, + "num_tokens": 358914710.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.787935495376587, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8686736822128296, + "num_tokens": 358947493.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5349003076553345, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.873300313949585, + "num_tokens": 358988139.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.575140118598938, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8786498308181763, + "num_tokens": 359025721.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5282249450683594, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8621727824211121, + "num_tokens": 359070489.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5910807847976685, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8835536241531372, + "num_tokens": 359110846.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.4554778337478638, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8786435127258301, + "num_tokens": 359154151.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.639220118522644, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8860278129577637, + "num_tokens": 359188175.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6056606769561768, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8667869567871094, + "num_tokens": 359227532.0, + "step": 9414 + }, + { + "epoch": 1.1976847729296527, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7063655853271484, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8570164442062378, + "num_tokens": 359263011.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.600450873374939, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8738993406295776, + "num_tokens": 359301214.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6935410499572754, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8448554277420044, + "num_tokens": 359342338.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5676039457321167, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8777724504470825, + "num_tokens": 359384031.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.8191009759902954, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8688724040985107, + "num_tokens": 359415847.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5922279357910156, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.876823902130127, + "num_tokens": 359456183.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7708942890167236, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8813197612762451, + "num_tokens": 359487500.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6847480535507202, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8636678457260132, + "num_tokens": 359526212.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.804801344871521, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8549490571022034, + "num_tokens": 359560129.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5444731712341309, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8624358773231506, + "num_tokens": 359602538.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7087340354919434, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8743588924407959, + "num_tokens": 359636750.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6047561168670654, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8638197183609009, + "num_tokens": 359676494.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6220753192901611, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8760312795639038, + "num_tokens": 359713078.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6854914426803589, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8652650117874146, + "num_tokens": 359749989.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6373025178909302, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8713948726654053, + "num_tokens": 359784029.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.580448031425476, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8669098019599915, + "num_tokens": 359824682.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6775423288345337, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8792413473129272, + "num_tokens": 359865048.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.4812061786651611, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8711365461349487, + "num_tokens": 359907581.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6033070087432861, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8708857297897339, + "num_tokens": 359952438.0, + "step": 9433 + }, + { + "epoch": 1.2001017682228725, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6729522943496704, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8682969212532043, + "num_tokens": 359990303.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6554005146026611, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8622989058494568, + "num_tokens": 360028578.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.584731936454773, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8790249228477478, + "num_tokens": 360066196.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7558757066726685, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8716997504234314, + "num_tokens": 360098906.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6538797616958618, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8787517547607422, + "num_tokens": 360133219.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6610857248306274, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8650844693183899, + "num_tokens": 360168656.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5871102809906006, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8700789213180542, + "num_tokens": 360206550.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.650650978088379, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8651970028877258, + "num_tokens": 360243133.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5400375127792358, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.87436842918396, + "num_tokens": 360281830.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.657944917678833, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.864363968372345, + "num_tokens": 360321720.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5001803636550903, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8706540465354919, + "num_tokens": 360367760.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7356516122817993, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8682857155799866, + "num_tokens": 360403035.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6584805250167847, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8672894835472107, + "num_tokens": 360438766.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7673128843307495, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8615900278091431, + "num_tokens": 360471656.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.8331074714660645, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8445038795471191, + "num_tokens": 360504368.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5604885816574097, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8710936903953552, + "num_tokens": 360545734.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6375693082809448, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8707361817359924, + "num_tokens": 360578765.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6236335039138794, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8657865524291992, + "num_tokens": 360618681.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5604090690612793, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8613746166229248, + "num_tokens": 360657540.0, + "step": 9452 + }, + { + "epoch": 1.202518763516092, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6169084310531616, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8655903339385986, + "num_tokens": 360696788.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.4919782876968384, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8868014812469482, + "num_tokens": 360733688.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5998408794403076, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8756814002990723, + "num_tokens": 360767719.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7221767902374268, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8597056269645691, + "num_tokens": 360802893.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.725059151649475, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.859046995639801, + "num_tokens": 360838637.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.72579026222229, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8748394250869751, + "num_tokens": 360874477.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5885175466537476, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8703804612159729, + "num_tokens": 360909234.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5424244403839111, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8801326155662537, + "num_tokens": 360948572.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6377897262573242, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8788169026374817, + "num_tokens": 360984033.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.699485421180725, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8569737672805786, + "num_tokens": 361021027.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6301673650741577, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8857264518737793, + "num_tokens": 361061801.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7097431421279907, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.876262366771698, + "num_tokens": 361098698.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6882843971252441, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8681474924087524, + "num_tokens": 361136612.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.620945692062378, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8799368143081665, + "num_tokens": 361172359.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5621142387390137, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8936343193054199, + "num_tokens": 361210668.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6446521282196045, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8598816990852356, + "num_tokens": 361250661.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6475952863693237, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8721600770950317, + "num_tokens": 361289090.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7315179109573364, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8628612160682678, + "num_tokens": 361326952.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6246392726898193, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.886258602142334, + "num_tokens": 361360226.0, + "step": 9471 + }, + { + "epoch": 1.2049357588093117, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.557059407234192, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8810660243034363, + "num_tokens": 361401986.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5523861646652222, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8768035173416138, + "num_tokens": 361442419.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5393248796463013, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8741178512573242, + "num_tokens": 361483591.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.562985897064209, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.874748706817627, + "num_tokens": 361518923.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6116465330123901, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8738824725151062, + "num_tokens": 361554797.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6464152336120605, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8721789121627808, + "num_tokens": 361590652.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.655970811843872, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8639136552810669, + "num_tokens": 361632995.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5074366331100464, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8859212398529053, + "num_tokens": 361669594.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.663532018661499, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8640263080596924, + "num_tokens": 361709651.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6267114877700806, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8568671941757202, + "num_tokens": 361750582.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.537614345550537, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.864401638507843, + "num_tokens": 361795217.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6288343667984009, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8743184804916382, + "num_tokens": 361832152.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 17.043825149536133, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8508737087249756, + "num_tokens": 361872762.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5808014869689941, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8813048005104065, + "num_tokens": 361916845.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7647593021392822, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.871108889579773, + "num_tokens": 361951853.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6449735164642334, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8638750910758972, + "num_tokens": 361992282.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8491055965423584, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.863803505897522, + "num_tokens": 362022824.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6201233863830566, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8843922019004822, + "num_tokens": 362058186.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5245954990386963, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8830123543739319, + "num_tokens": 362099579.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8430713415145874, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8645204305648804, + "num_tokens": 362133174.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.704673409461975, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8781580924987793, + "num_tokens": 362172823.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7673263549804688, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8630528450012207, + "num_tokens": 362209754.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4668902158737183, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8760842084884644, + "num_tokens": 362253065.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7216355800628662, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8729499578475952, + "num_tokens": 362291067.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7568607330322266, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8609827160835266, + "num_tokens": 362323233.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7131491899490356, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8513035774230957, + "num_tokens": 362365025.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.939258098602295, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.878394603729248, + "num_tokens": 362396484.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.555298089981079, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8668326139450073, + "num_tokens": 362440484.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6093060970306396, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.869484543800354, + "num_tokens": 362480525.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6109033823013306, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8672882318496704, + "num_tokens": 362518231.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6107993125915527, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8770590424537659, + "num_tokens": 362555805.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.614019751548767, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8445018529891968, + "num_tokens": 362597057.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.683559775352478, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8646330237388611, + "num_tokens": 362635708.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.539684772491455, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8660920858383179, + "num_tokens": 362676451.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7050195932388306, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8665989637374878, + "num_tokens": 362710641.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6288148164749146, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8783541917800903, + "num_tokens": 362746545.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5407589673995972, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8685251474380493, + "num_tokens": 362789268.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4101788997650146, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8822457194328308, + "num_tokens": 362833962.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.518131971359253, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8707069754600525, + "num_tokens": 362875102.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.636448860168457, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8710049390792847, + "num_tokens": 362911721.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5429282188415527, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8681487441062927, + "num_tokens": 362949756.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7721925973892212, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8611773252487183, + "num_tokens": 362983179.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5250171422958374, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8613924980163574, + "num_tokens": 363023154.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5690702199935913, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8673396110534668, + "num_tokens": 363063351.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6285258531570435, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8675382137298584, + "num_tokens": 363101393.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6618326902389526, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8498287200927734, + "num_tokens": 363141365.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6378791332244873, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8847062587738037, + "num_tokens": 363177422.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7423747777938843, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.861121416091919, + "num_tokens": 363210969.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.635353922843933, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8673796653747559, + "num_tokens": 363248210.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7191704511642456, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8795091509819031, + "num_tokens": 363283992.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.696529507637024, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8679083585739136, + "num_tokens": 363319121.0, + "step": 9522 + }, + { + "epoch": 1.2114234830174277, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5568602085113525, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8749478459358215, + "num_tokens": 363356880.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4284253120422363, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8924386501312256, + "num_tokens": 363395981.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6243441104888916, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.862178385257721, + "num_tokens": 363432334.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.62583327293396, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8875714540481567, + "num_tokens": 363466970.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6404659748077393, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.873702883720398, + "num_tokens": 363500405.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8066155910491943, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.865990161895752, + "num_tokens": 363530264.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5066255331039429, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8712924122810364, + "num_tokens": 363578626.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6567387580871582, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8610091805458069, + "num_tokens": 363617500.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6459132432937622, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8775986433029175, + "num_tokens": 363654293.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.62522554397583, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8719109296798706, + "num_tokens": 363690998.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6725133657455444, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8641475439071655, + "num_tokens": 363726832.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6913057565689087, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8748841285705566, + "num_tokens": 363765262.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6106960773468018, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.852149486541748, + "num_tokens": 363808095.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.514030933380127, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8721228837966919, + "num_tokens": 363848400.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6833012104034424, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8635963797569275, + "num_tokens": 363882478.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4521427154541016, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8899453282356262, + "num_tokens": 363921364.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5624022483825684, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8681470155715942, + "num_tokens": 363963022.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5824276208877563, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8559117913246155, + "num_tokens": 364002631.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8199985027313232, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8711018562316895, + "num_tokens": 364034762.0, + "step": 9541 + }, + { + "epoch": 1.2138404783106476, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5594775676727295, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8756577968597412, + "num_tokens": 364077147.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6953742504119873, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8532611131668091, + "num_tokens": 364117008.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6653023958206177, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8579891920089722, + "num_tokens": 364156875.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5674461126327515, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8818559050559998, + "num_tokens": 364196171.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4826815128326416, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8812237977981567, + "num_tokens": 364239713.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5276683568954468, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8753925561904907, + "num_tokens": 364280970.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6158101558685303, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.866904616355896, + "num_tokens": 364315601.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6264704465866089, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8759229779243469, + "num_tokens": 364352700.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5099527835845947, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.866715669631958, + "num_tokens": 364397150.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5863149166107178, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8634176850318909, + "num_tokens": 364438910.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5130987167358398, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8617227077484131, + "num_tokens": 364479436.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4593569040298462, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8605213761329651, + "num_tokens": 364525284.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4760111570358276, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8740214109420776, + "num_tokens": 364567259.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6395752429962158, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8706868290901184, + "num_tokens": 364603596.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6635351181030273, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.867551863193512, + "num_tokens": 364642434.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.687495231628418, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8597269654273987, + "num_tokens": 364683359.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.57857346534729, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8646575212478638, + "num_tokens": 364725747.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5920228958129883, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.876606285572052, + "num_tokens": 364765122.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5868452787399292, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8556936979293823, + "num_tokens": 364809478.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5654209852218628, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.873517632484436, + "num_tokens": 364849242.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6094154119491577, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.862053632736206, + "num_tokens": 364889326.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6039252281188965, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8653465509414673, + "num_tokens": 364926199.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6058125495910645, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.855048656463623, + "num_tokens": 364966880.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7062782049179077, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8580100536346436, + "num_tokens": 365002492.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4967279434204102, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.867480993270874, + "num_tokens": 365047497.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7789860963821411, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8529314398765564, + "num_tokens": 365078942.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8040177822113037, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.866303563117981, + "num_tokens": 365110405.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5659986734390259, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.879654049873352, + "num_tokens": 365145720.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5577430725097656, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8588509559631348, + "num_tokens": 365185533.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4932799339294434, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8571262359619141, + "num_tokens": 365232597.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7122266292572021, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8497068881988525, + "num_tokens": 365268640.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6316742897033691, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8778173923492432, + "num_tokens": 365303624.0, + "step": 9573 + }, + { + "epoch": 1.2179112072255438, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.4372859001159668, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8756483793258667, + "num_tokens": 365349563.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6001222133636475, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8606439232826233, + "num_tokens": 365387945.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5961486101150513, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8872222900390625, + "num_tokens": 365422771.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.860095500946045, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8658783435821533, + "num_tokens": 365450745.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6473824977874756, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8702946305274963, + "num_tokens": 365490271.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5415842533111572, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8660908341407776, + "num_tokens": 365533642.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.700555443763733, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.875190019607544, + "num_tokens": 365572541.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5596939325332642, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8711011409759521, + "num_tokens": 365612337.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6983708143234253, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8647760152816772, + "num_tokens": 365649817.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6576199531555176, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8669751882553101, + "num_tokens": 365685803.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.7749733924865723, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8615714907646179, + "num_tokens": 365721802.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.606461763381958, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8790357112884521, + "num_tokens": 365755597.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5052824020385742, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8733019232749939, + "num_tokens": 365795925.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4834870100021362, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8737425804138184, + "num_tokens": 365839896.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7038904428482056, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8737154603004456, + "num_tokens": 365874118.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6019867658615112, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8700144290924072, + "num_tokens": 365911949.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5650500059127808, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8697004318237305, + "num_tokens": 365953832.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6421769857406616, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.867271900177002, + "num_tokens": 365992212.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.439431071281433, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8659799098968506, + "num_tokens": 366037893.0, + "step": 9592 + }, + { + "epoch": 1.2203282025187634, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5976974964141846, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8838757872581482, + "num_tokens": 366076874.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6487140655517578, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8780630826950073, + "num_tokens": 366112568.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.5641690492630005, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8787161111831665, + "num_tokens": 366150134.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7053861618041992, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8503637313842773, + "num_tokens": 366190406.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.7508260011672974, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8564330339431763, + "num_tokens": 366231723.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.6135133504867554, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8592393398284912, + "num_tokens": 366271846.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 2.1576881408691406e-05, + "grad_norm": 1.931239366531372, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8634108304977417, + "num_tokens": 366300909.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5597865581512451, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8727945685386658, + "num_tokens": 366341504.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6778028011322021, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.862322211265564, + "num_tokens": 366382231.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5808476209640503, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8851162195205688, + "num_tokens": 366427441.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6594901084899902, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8709781169891357, + "num_tokens": 366466427.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.532224178314209, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8760692477226257, + "num_tokens": 366508779.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6577012538909912, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8710049390792847, + "num_tokens": 366541733.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 3.695537805557251, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8671441078186035, + "num_tokens": 366580544.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7798739671707153, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8879721164703369, + "num_tokens": 366614742.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6968638896942139, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8832762241363525, + "num_tokens": 366652268.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6351014375686646, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8698654174804688, + "num_tokens": 366690733.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.494968295097351, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.883737325668335, + "num_tokens": 366730311.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.809711217880249, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8724372386932373, + "num_tokens": 366765237.0, + "step": 9611 + }, + { + "epoch": 1.2227451978119832, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7820098400115967, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8585872650146484, + "num_tokens": 366802984.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.68724524974823, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8672458529472351, + "num_tokens": 366840550.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.719750165939331, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8667329549789429, + "num_tokens": 366874921.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5908787250518799, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8776666522026062, + "num_tokens": 366914007.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5214675664901733, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8781973719596863, + "num_tokens": 366956760.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5708909034729004, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8833997845649719, + "num_tokens": 366999554.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5648744106292725, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8708348870277405, + "num_tokens": 367039368.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5292305946350098, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8722044229507446, + "num_tokens": 367081812.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4744367599487305, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8807125091552734, + "num_tokens": 367124063.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6278294324874878, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8618639707565308, + "num_tokens": 367160800.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.686241626739502, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8532639741897583, + "num_tokens": 367197669.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7380717992782593, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8751515746116638, + "num_tokens": 367233465.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7937875986099243, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8646490573883057, + "num_tokens": 367264694.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7392804622650146, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8665697574615479, + "num_tokens": 367298912.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6111232042312622, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8724521398544312, + "num_tokens": 367339000.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6325405836105347, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8642115592956543, + "num_tokens": 367380008.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6311805248260498, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8680008053779602, + "num_tokens": 367416144.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6720218658447266, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8635275363922119, + "num_tokens": 367456595.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.627263069152832, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8737035393714905, + "num_tokens": 367489836.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6408296823501587, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8630151748657227, + "num_tokens": 367528834.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.650199294090271, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.868992030620575, + "num_tokens": 367564728.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7402616739273071, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8601255416870117, + "num_tokens": 367599314.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5432645082473755, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8645666837692261, + "num_tokens": 367641275.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.542049527168274, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8761624097824097, + "num_tokens": 367683391.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6535965204238892, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8684884309768677, + "num_tokens": 367719668.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6478768587112427, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8732236623764038, + "num_tokens": 367758796.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6590903997421265, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8668926954269409, + "num_tokens": 367795104.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5636966228485107, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8603659868240356, + "num_tokens": 367836486.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.705198049545288, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8463534116744995, + "num_tokens": 367872361.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7738937139511108, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8704724311828613, + "num_tokens": 367904339.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.782538890838623, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.859272837638855, + "num_tokens": 367937438.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6065642833709717, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8828268051147461, + "num_tokens": 367976262.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8991080522537231, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8614333868026733, + "num_tokens": 368019708.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7252371311187744, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8633764982223511, + "num_tokens": 368058801.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5154556035995483, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.878416895866394, + "num_tokens": 368098450.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.608250617980957, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8638008832931519, + "num_tokens": 368137734.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6991653442382812, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8638408184051514, + "num_tokens": 368173281.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6381808519363403, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8757829070091248, + "num_tokens": 368213615.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5668991804122925, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8834001421928406, + "num_tokens": 368249688.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5190414190292358, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8731679916381836, + "num_tokens": 368292914.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.759190559387207, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8478518724441528, + "num_tokens": 368333397.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.787421703338623, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8709691762924194, + "num_tokens": 368367613.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6503087282180786, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8625435829162598, + "num_tokens": 368407854.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 2.0764689445495605, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.874590277671814, + "num_tokens": 368442156.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6639201641082764, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8724372386932373, + "num_tokens": 368481617.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5822263956069946, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8807957172393799, + "num_tokens": 368523603.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.563478708267212, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8735325336456299, + "num_tokens": 368564775.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.704732060432434, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.847332775592804, + "num_tokens": 368606752.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.624509334564209, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8757044672966003, + "num_tokens": 368641801.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6055208444595337, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8726140260696411, + "num_tokens": 368676039.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.570906400680542, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8720472455024719, + "num_tokens": 368715050.0, + "step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5854315757751465, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8888612985610962, + "num_tokens": 368750827.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7676889896392822, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.865408182144165, + "num_tokens": 368789359.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.774267554283142, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8702360987663269, + "num_tokens": 368820194.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6795412302017212, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8642277717590332, + "num_tokens": 368857670.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6228163242340088, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8794649839401245, + "num_tokens": 368892848.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6731131076812744, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8726273775100708, + "num_tokens": 368928692.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6718307733535767, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8524191379547119, + "num_tokens": 368967908.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7315764427185059, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8609001636505127, + "num_tokens": 369006809.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6203527450561523, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8773571252822876, + "num_tokens": 369044340.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7029367685317993, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8772489428520203, + "num_tokens": 369074655.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6726549863815308, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8715097308158875, + "num_tokens": 369111485.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.609562873840332, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8835138082504272, + "num_tokens": 369152428.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6577199697494507, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8776903748512268, + "num_tokens": 369186050.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.532744288444519, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8787175416946411, + "num_tokens": 369224017.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6559077501296997, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8607679605484009, + "num_tokens": 369260588.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.742147445678711, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8814975023269653, + "num_tokens": 369295396.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5168845653533936, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8711668252944946, + "num_tokens": 369340915.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6229203939437866, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8703317642211914, + "num_tokens": 369379629.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.613815426826477, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8737250566482544, + "num_tokens": 369417844.0, + "step": 9681 + }, + { + "epoch": 1.2316499173133189, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6469231843948364, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.867853045463562, + "num_tokens": 369457939.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.612689733505249, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8517912030220032, + "num_tokens": 369498840.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5641368627548218, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8588505387306213, + "num_tokens": 369539488.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7278037071228027, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8601762056350708, + "num_tokens": 369573503.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4697033166885376, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8854737281799316, + "num_tokens": 369612423.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6142044067382812, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8747345209121704, + "num_tokens": 369649735.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6709530353546143, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.85783851146698, + "num_tokens": 369688030.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6263128519058228, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8609382510185242, + "num_tokens": 369725540.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.554525375366211, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8662159442901611, + "num_tokens": 369768991.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6236743927001953, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8635689616203308, + "num_tokens": 369807195.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.847994327545166, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8687379360198975, + "num_tokens": 369837818.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5965715646743774, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8520797491073608, + "num_tokens": 369882439.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5365041494369507, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8786007165908813, + "num_tokens": 369923222.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5340728759765625, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8713406324386597, + "num_tokens": 369961761.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5397011041641235, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8858197927474976, + "num_tokens": 369994988.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6450496912002563, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8640650510787964, + "num_tokens": 370032231.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6208754777908325, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8616777658462524, + "num_tokens": 370074704.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7579741477966309, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8744484782218933, + "num_tokens": 370109256.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4930951595306396, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8701840043067932, + "num_tokens": 370150512.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6263716220855713, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8690094947814941, + "num_tokens": 370185477.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8896468877792358, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8672268390655518, + "num_tokens": 370214578.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6443326473236084, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8582032918930054, + "num_tokens": 370257828.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5896414518356323, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8744914531707764, + "num_tokens": 370296433.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6024943590164185, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8735032081604004, + "num_tokens": 370331773.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5139093399047852, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8660526275634766, + "num_tokens": 370373303.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5551508665084839, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8617570996284485, + "num_tokens": 370415469.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5381991863250732, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8730535507202148, + "num_tokens": 370452919.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5232123136520386, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8728632926940918, + "num_tokens": 370496675.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6966925859451294, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8594867587089539, + "num_tokens": 370534461.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5581026077270508, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8858414888381958, + "num_tokens": 370571620.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6751289367675781, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8594055771827698, + "num_tokens": 370606901.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6987196207046509, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8826625347137451, + "num_tokens": 370643464.0, + "step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6694793701171875, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8660403490066528, + "num_tokens": 370683861.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6994115114212036, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8708911538124084, + "num_tokens": 370717827.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5192745923995972, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8693352937698364, + "num_tokens": 370759165.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6122373342514038, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8740878701210022, + "num_tokens": 370797769.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6013953685760498, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8607000112533569, + "num_tokens": 370838111.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7027227878570557, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8793805837631226, + "num_tokens": 370874996.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5662510395050049, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8747158050537109, + "num_tokens": 370913721.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.648931622505188, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8668927550315857, + "num_tokens": 370953754.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5703446865081787, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8630555868148804, + "num_tokens": 370994907.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5247191190719604, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8727602362632751, + "num_tokens": 371040989.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6245090961456299, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8729467391967773, + "num_tokens": 371081041.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5115349292755127, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8735916614532471, + "num_tokens": 371122964.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5606824159622192, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.874535083770752, + "num_tokens": 371158782.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5895575284957886, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8578948974609375, + "num_tokens": 371198500.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6701998710632324, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8625556230545044, + "num_tokens": 371236949.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5927823781967163, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8851684927940369, + "num_tokens": 371275416.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7071088552474976, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8809211850166321, + "num_tokens": 371309201.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6925321817398071, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8668112754821777, + "num_tokens": 371346002.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5767836570739746, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8528558015823364, + "num_tokens": 371388868.0, + "step": 9732 + }, + { + "epoch": 1.238137641521435, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.767323613166809, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8589438199996948, + "num_tokens": 371425980.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 1.620219111442566, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8694323301315308, + "num_tokens": 371463404.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5919069051742554, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8731033205986023, + "num_tokens": 371501612.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7304431200027466, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8478233814239502, + "num_tokens": 371541768.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.573676347732544, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8758065700531006, + "num_tokens": 371580810.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5690624713897705, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8687447309494019, + "num_tokens": 371620466.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6885168552398682, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8628653287887573, + "num_tokens": 371656117.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5658605098724365, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8807045221328735, + "num_tokens": 371691829.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4936546087265015, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8803558349609375, + "num_tokens": 371733568.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5249277353286743, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8837758302688599, + "num_tokens": 371775331.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6332449913024902, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8590453863143921, + "num_tokens": 371815945.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5574243068695068, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8702374696731567, + "num_tokens": 371855197.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6458892822265625, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8651540875434875, + "num_tokens": 371893882.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6793702840805054, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8479208946228027, + "num_tokens": 371932694.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7840265035629272, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8696144819259644, + "num_tokens": 371963724.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7516173124313354, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8616708517074585, + "num_tokens": 372002220.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5236217975616455, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8794432878494263, + "num_tokens": 372044679.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6644821166992188, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8781693577766418, + "num_tokens": 372081186.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5714479684829712, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8916685581207275, + "num_tokens": 372119700.0, + "step": 9751 + }, + { + "epoch": 1.2405546368146547, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5647982358932495, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8696556687355042, + "num_tokens": 372158443.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.64727783203125, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8715516328811646, + "num_tokens": 372193855.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5868116617202759, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8776520490646362, + "num_tokens": 372232991.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6539762020111084, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8673008680343628, + "num_tokens": 372267919.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.702440619468689, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8699359893798828, + "num_tokens": 372307381.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7126948833465576, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8394210934638977, + "num_tokens": 372346074.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7897796630859375, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8573121428489685, + "num_tokens": 372377841.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5622892379760742, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.876847505569458, + "num_tokens": 372414433.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4959508180618286, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8802565336227417, + "num_tokens": 372456510.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6358407735824585, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8717061281204224, + "num_tokens": 372496311.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4837486743927002, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8585017919540405, + "num_tokens": 372541064.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 2.181529998779297e-05, + "grad_norm": 1.5786283016204834, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8622661232948303, + "num_tokens": 372582711.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.605157732963562, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8756378889083862, + "num_tokens": 372618775.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.585485577583313, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8685609698295593, + "num_tokens": 372657955.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.543673038482666, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8682887554168701, + "num_tokens": 372700048.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5767436027526855, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8661781549453735, + "num_tokens": 372742341.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6588233709335327, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8609062433242798, + "num_tokens": 372776760.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5020955801010132, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.875391960144043, + "num_tokens": 372817108.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5729811191558838, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8598295450210571, + "num_tokens": 372856976.0, + "step": 9770 + }, + { + "epoch": 1.2429716321078743, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6893075704574585, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8367218971252441, + "num_tokens": 372896376.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.9326705932617188, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8661854863166809, + "num_tokens": 372932114.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5228087902069092, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8817168474197388, + "num_tokens": 372972557.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5996991395950317, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.871346116065979, + "num_tokens": 373014111.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6290793418884277, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.868211030960083, + "num_tokens": 373049739.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5820351839065552, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8505134582519531, + "num_tokens": 373092894.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5539146661758423, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8799549341201782, + "num_tokens": 373131767.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7230281829833984, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.850741982460022, + "num_tokens": 373171310.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4774158000946045, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8814388513565063, + "num_tokens": 373211454.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.431314468383789, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8739698529243469, + "num_tokens": 373255535.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4800907373428345, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8770101070404053, + "num_tokens": 373297993.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4777556657791138, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8778930306434631, + "num_tokens": 373338042.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5975089073181152, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8561462759971619, + "num_tokens": 373377042.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6262751817703247, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.874172031879425, + "num_tokens": 373415377.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6443464756011963, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8663791418075562, + "num_tokens": 373454320.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5049915313720703, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8674993515014648, + "num_tokens": 373495915.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5115337371826172, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8793540000915527, + "num_tokens": 373537044.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5445022583007812, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8860645294189453, + "num_tokens": 373573404.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6836440563201904, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8643853068351746, + "num_tokens": 373608541.0, + "step": 9789 + }, + { + "epoch": 1.245388627401094, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6140611171722412, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8577545881271362, + "num_tokens": 373654282.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6036500930786133, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8845257759094238, + "num_tokens": 373687736.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6709418296813965, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8649031519889832, + "num_tokens": 373726207.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6429545879364014, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8664786219596863, + "num_tokens": 373767443.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4677003622055054, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8816546201705933, + "num_tokens": 373809340.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.623079538345337, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8762294054031372, + "num_tokens": 373845902.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6562491655349731, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8614519834518433, + "num_tokens": 373883973.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7206388711929321, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8668373227119446, + "num_tokens": 373918732.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6510717868804932, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8638625144958496, + "num_tokens": 373959772.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6949068307876587, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8572869896888733, + "num_tokens": 374000151.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.581382155418396, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8726824522018433, + "num_tokens": 374039415.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.54056978225708, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8722914457321167, + "num_tokens": 374083568.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.67247474193573, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8712807893753052, + "num_tokens": 374118762.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5397834777832031, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8637110590934753, + "num_tokens": 374160262.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5759880542755127, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8633725643157959, + "num_tokens": 374200619.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5588656663894653, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8785604238510132, + "num_tokens": 374238303.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5411016941070557, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8810552358627319, + "num_tokens": 374278849.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7399755716323853, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8646255731582642, + "num_tokens": 374320983.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.54414963722229, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.885651707649231, + "num_tokens": 374354520.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7491798400878906, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8610158562660217, + "num_tokens": 374388411.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6750630140304565, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8604309558868408, + "num_tokens": 374425076.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5125659704208374, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8765513896942139, + "num_tokens": 374466219.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7893832921981812, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8735088109970093, + "num_tokens": 374497652.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5444759130477905, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8673897385597229, + "num_tokens": 374535379.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.4468995332717896, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8714043498039246, + "num_tokens": 374579779.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5455706119537354, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8715078830718994, + "num_tokens": 374620582.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6026127338409424, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8649036884307861, + "num_tokens": 374659470.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5490727424621582, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8657302856445312, + "num_tokens": 374704394.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.499088168144226, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8758712410926819, + "num_tokens": 374745338.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7261607646942139, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8642914295196533, + "num_tokens": 374781079.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6372580528259277, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8674771189689636, + "num_tokens": 374820301.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.612239122390747, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8680536150932312, + "num_tokens": 374858194.0, + "step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5994625091552734, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8691373467445374, + "num_tokens": 374896818.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5654836893081665, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8626247644424438, + "num_tokens": 374937202.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6176846027374268, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8752217292785645, + "num_tokens": 374971036.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5131971836090088, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8603529930114746, + "num_tokens": 375015280.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5194180011749268, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8745315074920654, + "num_tokens": 375056517.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.675281047821045, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8602339625358582, + "num_tokens": 375096874.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6135814189910889, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8706219792366028, + "num_tokens": 375134957.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6415674686431885, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8654859066009521, + "num_tokens": 375173149.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.7820385694503784, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8737951517105103, + "num_tokens": 375205403.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6290247440338135, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8641185760498047, + "num_tokens": 375244082.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.5910649299621582, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8729033470153809, + "num_tokens": 375285459.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.655448317527771, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8676272034645081, + "num_tokens": 375319766.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.6737862825393677, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8655921816825867, + "num_tokens": 375356945.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 2.1696090698242188e-05, + "grad_norm": 1.8760319948196411, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8622617721557617, + "num_tokens": 375387296.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5287964344024658, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8655019998550415, + "num_tokens": 375427961.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6002519130706787, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8704272508621216, + "num_tokens": 375469389.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6238559484481812, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8674465417861938, + "num_tokens": 375508004.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6241377592086792, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8710679411888123, + "num_tokens": 375547965.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6160269975662231, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8759171962738037, + "num_tokens": 375588445.0, + "step": 9840 + }, + { + "epoch": 1.25187635160921, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5614744424819946, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8642666339874268, + "num_tokens": 375628894.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6243199110031128, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8587884306907654, + "num_tokens": 375667143.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6037371158599854, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8727598190307617, + "num_tokens": 375706633.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6839373111724854, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8766995668411255, + "num_tokens": 375741242.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6058297157287598, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8781563639640808, + "num_tokens": 375777435.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5757750272750854, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8640495538711548, + "num_tokens": 375818901.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4659247398376465, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8788533210754395, + "num_tokens": 375866427.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7033545970916748, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8701962828636169, + "num_tokens": 375904723.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.602523684501648, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8735299110412598, + "num_tokens": 375942604.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.712469220161438, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8699055910110474, + "num_tokens": 375976569.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5835484266281128, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8675055503845215, + "num_tokens": 376016746.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5630472898483276, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8650006651878357, + "num_tokens": 376056749.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6527003049850464, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8705433011054993, + "num_tokens": 376096787.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.587942361831665, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8755887150764465, + "num_tokens": 376136055.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6473302841186523, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8664063215255737, + "num_tokens": 376174621.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5142312049865723, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8676252961158752, + "num_tokens": 376214407.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5481528043746948, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8838472366333008, + "num_tokens": 376251240.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.55447518825531, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8718642592430115, + "num_tokens": 376290959.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7294267416000366, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8766776323318481, + "num_tokens": 376328282.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.833026647567749, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8707376718521118, + "num_tokens": 376366266.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.74897301197052, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8514593839645386, + "num_tokens": 376404194.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6999030113220215, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8811047077178955, + "num_tokens": 376438664.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.545372724533081, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8608812093734741, + "num_tokens": 376477565.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.709814190864563, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8786033987998962, + "num_tokens": 376513634.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.537023663520813, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8679553270339966, + "num_tokens": 376556957.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6726447343826294, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8562057614326477, + "num_tokens": 376593986.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6295702457427979, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8643077611923218, + "num_tokens": 376630344.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.510115385055542, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8743674755096436, + "num_tokens": 376671556.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4738579988479614, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8799569606781006, + "num_tokens": 376713127.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6202422380447388, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8616390228271484, + "num_tokens": 376751505.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.650930643081665, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8657461404800415, + "num_tokens": 376790822.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5838266611099243, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8721172213554382, + "num_tokens": 376834412.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5406599044799805, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8586297035217285, + "num_tokens": 376877433.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7322442531585693, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8688905239105225, + "num_tokens": 376914797.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7315394878387451, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8626335859298706, + "num_tokens": 376954575.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5422190427780151, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8823468685150146, + "num_tokens": 376990578.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4545978307724, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8875288963317871, + "num_tokens": 377034697.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6194794178009033, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.878968358039856, + "num_tokens": 377075008.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.682593822479248, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8628340363502502, + "num_tokens": 377110032.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6838926076889038, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8721290826797485, + "num_tokens": 377147797.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6836670637130737, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8695914149284363, + "num_tokens": 377184920.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8619637489318848, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8742584586143494, + "num_tokens": 377217287.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.593770980834961, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.879070520401001, + "num_tokens": 377255120.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.512007474899292, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8601213693618774, + "num_tokens": 377299682.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6161988973617554, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8611335754394531, + "num_tokens": 377342718.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5850590467453003, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8797166347503662, + "num_tokens": 377380031.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6480159759521484, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8790058493614197, + "num_tokens": 377416058.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7171659469604492, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8603488206863403, + "num_tokens": 377455817.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8525314331054688, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8571494817733765, + "num_tokens": 377492316.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7162747383117676, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8679500222206116, + "num_tokens": 377528719.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6501743793487549, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.850678026676178, + "num_tokens": 377571622.0, + "step": 9891 + }, + { + "epoch": 1.258364075817326, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7200452089309692, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8582210540771484, + "num_tokens": 377606473.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6019865274429321, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8675578236579895, + "num_tokens": 377649485.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5698672533035278, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8823179006576538, + "num_tokens": 377689039.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6090084314346313, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8710697889328003, + "num_tokens": 377727846.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.9083820581436157, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8636461496353149, + "num_tokens": 377759371.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6055867671966553, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.878534197807312, + "num_tokens": 377799952.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 3.747335910797119, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8742060661315918, + "num_tokens": 377837794.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6225982904434204, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8692926168441772, + "num_tokens": 377879482.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6998552083969116, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8697237372398376, + "num_tokens": 377917651.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5909723043441772, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.880321741104126, + "num_tokens": 377953399.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7233175039291382, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8557922840118408, + "num_tokens": 377994125.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5475242137908936, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8773318529129028, + "num_tokens": 378034891.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6554964780807495, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8794010877609253, + "num_tokens": 378070107.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8693263530731201, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8684670329093933, + "num_tokens": 378102078.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6326626539230347, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8778402209281921, + "num_tokens": 378141807.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6254678964614868, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8708129525184631, + "num_tokens": 378181888.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6619945764541626, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8698909878730774, + "num_tokens": 378219498.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5732823610305786, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8821185231208801, + "num_tokens": 378256587.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7445099353790283, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8669530749320984, + "num_tokens": 378295241.0, + "step": 9910 + }, + { + "epoch": 1.2607810711105456, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.541263461112976, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8728722929954529, + "num_tokens": 378334939.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7020082473754883, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8793553113937378, + "num_tokens": 378368481.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6369072198867798, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8720861673355103, + "num_tokens": 378404544.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5459246635437012, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8795320987701416, + "num_tokens": 378440816.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7021538019180298, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8821414113044739, + "num_tokens": 378478586.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8221052885055542, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8638423085212708, + "num_tokens": 378512947.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6636621952056885, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8777016401290894, + "num_tokens": 378548291.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7267240285873413, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8769734501838684, + "num_tokens": 378580549.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4973416328430176, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.885319709777832, + "num_tokens": 378620531.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.661057710647583, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8710160851478577, + "num_tokens": 378657448.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5430759191513062, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8658446073532104, + "num_tokens": 378698780.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7005512714385986, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8841913342475891, + "num_tokens": 378728665.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6192768812179565, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8725184798240662, + "num_tokens": 378763385.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 3.6327953338623047, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8874592781066895, + "num_tokens": 378807132.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.549437165260315, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8805159330368042, + "num_tokens": 378844605.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.607457160949707, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8784394264221191, + "num_tokens": 378880083.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5578700304031372, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8603209853172302, + "num_tokens": 378922832.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5676417350769043, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8694667220115662, + "num_tokens": 378963771.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5634044408798218, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8691370487213135, + "num_tokens": 379003967.0, + "step": 9929 + }, + { + "epoch": 1.2631980664037654, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6276224851608276, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.885036289691925, + "num_tokens": 379036249.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7327460050582886, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8690494298934937, + "num_tokens": 379069850.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8356916904449463, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8734478950500488, + "num_tokens": 379104578.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5055091381072998, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8836653232574463, + "num_tokens": 379142059.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5817749500274658, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8582819700241089, + "num_tokens": 379187393.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5765715837478638, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8700397610664368, + "num_tokens": 379222988.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7033072710037231, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8723785281181335, + "num_tokens": 379255358.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7340648174285889, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8552338480949402, + "num_tokens": 379298065.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6932768821716309, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8683825135231018, + "num_tokens": 379336223.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7516897916793823, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8546016216278076, + "num_tokens": 379377076.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.59895920753479, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.857842206954956, + "num_tokens": 379421573.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6348533630371094, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8810835480690002, + "num_tokens": 379458515.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7156505584716797, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8637101054191589, + "num_tokens": 379496509.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5939421653747559, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8682619333267212, + "num_tokens": 379538996.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6281579732894897, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8575239777565002, + "num_tokens": 379580595.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4436298608779907, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8833097219467163, + "num_tokens": 379622982.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.656907081604004, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8679381608963013, + "num_tokens": 379658796.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7026399374008179, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8724249601364136, + "num_tokens": 379695509.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.634067177772522, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8673602938652039, + "num_tokens": 379732692.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.623274803161621, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8746556043624878, + "num_tokens": 379768716.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5921976566314697, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8847962617874146, + "num_tokens": 379804857.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5597995519638062, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.873132586479187, + "num_tokens": 379842882.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.625113844871521, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8691322803497314, + "num_tokens": 379887099.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6453083753585815, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8821355104446411, + "num_tokens": 379927598.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.596600890159607, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8713502883911133, + "num_tokens": 379964548.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7511826753616333, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8765733242034912, + "num_tokens": 380003924.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.689536690711975, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8790643811225891, + "num_tokens": 380039966.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6820436716079712, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8683658838272095, + "num_tokens": 380073340.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6521023511886597, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8648125529289246, + "num_tokens": 380113687.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6961735486984253, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8636038303375244, + "num_tokens": 380154795.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6199744939804077, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8749388456344604, + "num_tokens": 380192667.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.63338041305542, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.879525899887085, + "num_tokens": 380226023.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6906477212905884, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8678113222122192, + "num_tokens": 380262321.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.4588286876678467, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8891998529434204, + "num_tokens": 380303331.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6064015626907349, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8864179849624634, + "num_tokens": 380341654.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5270739793777466, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8826600313186646, + "num_tokens": 380377187.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7373933792114258, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8632418513298035, + "num_tokens": 380411609.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6426748037338257, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8727187514305115, + "num_tokens": 380451372.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.622409462928772, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8648188710212708, + "num_tokens": 380489626.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.607915997505188, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8830109238624573, + "num_tokens": 380528945.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5326465368270874, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8668984174728394, + "num_tokens": 380573751.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6449655294418335, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8493354320526123, + "num_tokens": 380619235.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5077342987060547, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8851374387741089, + "num_tokens": 380661396.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5532598495483398, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8768813610076904, + "num_tokens": 380701224.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.571151614189148, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8742055892944336, + "num_tokens": 380737947.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5816845893859863, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8671442270278931, + "num_tokens": 380779805.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 3.7177062034606934, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8789923191070557, + "num_tokens": 380816776.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.71564519405365, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8841620683670044, + "num_tokens": 380849680.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.455520749092102, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8725973963737488, + "num_tokens": 380893724.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.628586769104004, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8706413507461548, + "num_tokens": 380933230.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.517490267753601, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8691480755805969, + "num_tokens": 380975088.0, + "step": 9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6528651714324951, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8620537519454956, + "num_tokens": 381016161.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5273969173431396, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8882768154144287, + "num_tokens": 381056090.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.614189624786377, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8717629909515381, + "num_tokens": 381090774.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6982176303863525, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8567017316818237, + "num_tokens": 381128218.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6821759939193726, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8549943566322327, + "num_tokens": 381167633.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.806777834892273, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8712378740310669, + "num_tokens": 381204597.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.516033411026001, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8858204483985901, + "num_tokens": 381242533.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5548758506774902, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8623088598251343, + "num_tokens": 381285148.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6621105670928955, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8802105784416199, + "num_tokens": 381320011.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6334941387176514, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8625300526618958, + "num_tokens": 381356235.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.505567193031311, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.880691409111023, + "num_tokens": 381396188.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6848567724227905, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8671656847000122, + "num_tokens": 381430994.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8160104751586914, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8685472011566162, + "num_tokens": 381462869.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.8073217868804932, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8707433938980103, + "num_tokens": 381494577.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5506871938705444, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8752560019493103, + "num_tokens": 381535684.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7099664211273193, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8857643008232117, + "num_tokens": 381567225.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.5079874992370605, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8580092787742615, + "num_tokens": 381610007.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7663724422454834, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.863783597946167, + "num_tokens": 381650788.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6381254196166992, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8638731241226196, + "num_tokens": 381692625.0, + "step": 9999 + }, + { + "epoch": 1.272102785905101, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.511143445968628, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8680383563041687, + "num_tokens": 381735848.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.680137038230896, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8721984624862671, + "num_tokens": 381769633.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.620882511138916, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8629305362701416, + "num_tokens": 381810253.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7148209810256958, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8691158294677734, + "num_tokens": 381848344.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5376307964324951, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8595535755157471, + "num_tokens": 381893383.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.588541030883789, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8685077428817749, + "num_tokens": 381935634.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6166502237319946, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8696682453155518, + "num_tokens": 381972953.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7103888988494873, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8587231040000916, + "num_tokens": 382010283.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.7137187719345093, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8854727745056152, + "num_tokens": 382046263.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.625991940498352, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8671536445617676, + "num_tokens": 382087695.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.8267327547073364, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8476775884628296, + "num_tokens": 382123226.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.690114140510559, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8679715394973755, + "num_tokens": 382161772.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.766552209854126, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8630027770996094, + "num_tokens": 382197895.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5997079610824585, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8906158208847046, + "num_tokens": 382233327.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5027827024459839, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8723520636558533, + "num_tokens": 382273215.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5718530416488647, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8791522979736328, + "num_tokens": 382311037.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7147575616836548, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.871583104133606, + "num_tokens": 382344864.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6357874870300293, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8727810382843018, + "num_tokens": 382383065.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6674237251281738, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.88096022605896, + "num_tokens": 382417347.0, + "step": 10018 + }, + { + "epoch": 1.2745197811983209, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.633646845817566, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8683269023895264, + "num_tokens": 382454604.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 2.193450927734375e-05, + "grad_norm": 1.6419620513916016, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8865131735801697, + "num_tokens": 382487967.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.615729570388794, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8708980679512024, + "num_tokens": 382526389.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6814996004104614, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8835490942001343, + "num_tokens": 382561578.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6075959205627441, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8722920417785645, + "num_tokens": 382600159.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6614999771118164, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8702411651611328, + "num_tokens": 382631887.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5392284393310547, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.871138870716095, + "num_tokens": 382672341.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5035488605499268, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8761230707168579, + "num_tokens": 382714535.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5550389289855957, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8585854172706604, + "num_tokens": 382757661.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6723427772521973, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8659509420394897, + "num_tokens": 382796759.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5215669870376587, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8718419075012207, + "num_tokens": 382835331.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6623026132583618, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8640201091766357, + "num_tokens": 382872053.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6183654069900513, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8751064538955688, + "num_tokens": 382914215.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7548140287399292, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.870730996131897, + "num_tokens": 382946945.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5917948484420776, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8752379417419434, + "num_tokens": 382987659.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.4463977813720703, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8696728348731995, + "num_tokens": 383032043.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5721299648284912, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8780712485313416, + "num_tokens": 383069991.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6589456796646118, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8734208345413208, + "num_tokens": 383104362.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5620003938674927, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8843948841094971, + "num_tokens": 383142760.0, + "step": 10037 + }, + { + "epoch": 1.2769367764915405, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6902527809143066, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8707675337791443, + "num_tokens": 383178255.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5766525268554688, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8705399036407471, + "num_tokens": 383218729.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6066310405731201, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8638517260551453, + "num_tokens": 383256652.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5729405879974365, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8836626410484314, + "num_tokens": 383295307.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6958693265914917, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8599188327789307, + "num_tokens": 383332355.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6186772584915161, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8541188836097717, + "num_tokens": 383375193.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.686218500137329, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8452208042144775, + "num_tokens": 383414316.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6658090353012085, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8694484233856201, + "num_tokens": 383450726.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.763423204421997, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8783880472183228, + "num_tokens": 383482663.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5470422506332397, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8693684339523315, + "num_tokens": 383522455.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5740132331848145, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.86038738489151, + "num_tokens": 383561557.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.471129059791565, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8693661093711853, + "num_tokens": 383603409.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6304712295532227, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8697689175605774, + "num_tokens": 383642095.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5873380899429321, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8726282119750977, + "num_tokens": 383680975.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.8176835775375366, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8545182347297668, + "num_tokens": 383714797.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 2.321127414703369, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8652041554450989, + "num_tokens": 383757208.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6269402503967285, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8669770956039429, + "num_tokens": 383797981.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6879175901412964, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8606012463569641, + "num_tokens": 383837006.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5099146366119385, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8825148344039917, + "num_tokens": 383875092.0, + "step": 10056 + }, + { + "epoch": 1.2793537717847603, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7433850765228271, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8619343042373657, + "num_tokens": 383908552.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.725189447402954, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8535671234130859, + "num_tokens": 383946021.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5870674848556519, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8630430102348328, + "num_tokens": 383985562.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.4677529335021973, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.874524712562561, + "num_tokens": 384028800.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7363592386245728, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8637170791625977, + "num_tokens": 384059125.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6297142505645752, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8704350590705872, + "num_tokens": 384093378.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5216193199157715, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8609100580215454, + "num_tokens": 384139240.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.8055630922317505, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8650798797607422, + "num_tokens": 384170706.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6735069751739502, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.874671459197998, + "num_tokens": 384208229.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6423912048339844, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8772741556167603, + "num_tokens": 384246008.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.763529896736145, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8747913837432861, + "num_tokens": 384277724.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5852755308151245, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8789060115814209, + "num_tokens": 384315791.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.723511815071106, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8446294069290161, + "num_tokens": 384354513.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5445700883865356, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8678706288337708, + "num_tokens": 384397794.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.8294926881790161, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.854807436466217, + "num_tokens": 384431629.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.4969862699508667, + "learning_rate": 1e-06, + "loss": 0.2823, + "mean_token_accuracy": 0.9066998958587646, + "num_tokens": 384468379.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6315069198608398, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8693259954452515, + "num_tokens": 384506580.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6604282855987549, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8727778792381287, + "num_tokens": 384542136.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.743539571762085, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8574377298355103, + "num_tokens": 384580843.0, + "step": 10075 + }, + { + "epoch": 1.2817707670779799, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.796933650970459, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8635039925575256, + "num_tokens": 384613807.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5264408588409424, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8612916469573975, + "num_tokens": 384655136.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.627925992012024, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8666226863861084, + "num_tokens": 384692878.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6679834127426147, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8549855947494507, + "num_tokens": 384732331.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7487332820892334, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.855371356010437, + "num_tokens": 384769239.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7325035333633423, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8682824969291687, + "num_tokens": 384808239.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6083441972732544, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8705889582633972, + "num_tokens": 384846961.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6827625036239624, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8837870955467224, + "num_tokens": 384877356.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.590368628501892, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.876431405544281, + "num_tokens": 384919878.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7628755569458008, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8681537508964539, + "num_tokens": 384957907.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6799556016921997, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8756483197212219, + "num_tokens": 384995817.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5392990112304688, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.884665846824646, + "num_tokens": 385031381.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7055903673171997, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8736019134521484, + "num_tokens": 385067721.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5864711999893188, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8659248352050781, + "num_tokens": 385108407.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5597976446151733, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8577428460121155, + "num_tokens": 385146427.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7359371185302734, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8648471832275391, + "num_tokens": 385180311.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5847702026367188, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8639732003211975, + "num_tokens": 385219957.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7085314989089966, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.860109269618988, + "num_tokens": 385258891.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5047060251235962, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8771799802780151, + "num_tokens": 385302844.0, + "step": 10094 + }, + { + "epoch": 1.2841877623711997, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.632863998413086, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8742943406105042, + "num_tokens": 385341250.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5637258291244507, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8753053545951843, + "num_tokens": 385380927.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.7742958068847656, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8683302402496338, + "num_tokens": 385414829.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6213289499282837, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8745864033699036, + "num_tokens": 385452211.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6625628471374512, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8600732088088989, + "num_tokens": 385488697.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5905178785324097, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8763262033462524, + "num_tokens": 385522573.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5924030542373657, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8669635653495789, + "num_tokens": 385558817.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.4926272630691528, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8646043539047241, + "num_tokens": 385598371.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.583340048789978, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8788229823112488, + "num_tokens": 385637748.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.6238161325454712, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.876436710357666, + "num_tokens": 385674082.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.5946664810180664, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8669503927230835, + "num_tokens": 385711906.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 2.205371856689453e-05, + "grad_norm": 1.717877984046936, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8720642328262329, + "num_tokens": 385750014.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6504632234573364, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.867774248123169, + "num_tokens": 385785171.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6226263046264648, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8591221570968628, + "num_tokens": 385823401.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6827205419540405, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8483683466911316, + "num_tokens": 385861568.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5037168264389038, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8896583318710327, + "num_tokens": 385898929.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.633966326713562, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8751430511474609, + "num_tokens": 385932585.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6593914031982422, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8633593916893005, + "num_tokens": 385972803.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.58540678024292, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8609821200370789, + "num_tokens": 386014060.0, + "step": 10113 + }, + { + "epoch": 1.2866047576644193, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.649585247039795, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8686962127685547, + "num_tokens": 386051900.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7693873643875122, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8761996030807495, + "num_tokens": 386088625.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.669506311416626, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.87086421251297, + "num_tokens": 386124144.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7451592683792114, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8808894753456116, + "num_tokens": 386156441.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.661584734916687, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8716413974761963, + "num_tokens": 386192533.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5721759796142578, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8712283372879028, + "num_tokens": 386231474.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7846108675003052, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8607048988342285, + "num_tokens": 386266706.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.587709903717041, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8637102842330933, + "num_tokens": 386309067.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.618101716041565, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8658542633056641, + "num_tokens": 386346704.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6821329593658447, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8750483989715576, + "num_tokens": 386379647.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5607298612594604, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8767577409744263, + "num_tokens": 386420825.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.669176697731018, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8719298839569092, + "num_tokens": 386457077.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.644258975982666, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8686803579330444, + "num_tokens": 386494602.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.644887089729309, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8628033399581909, + "num_tokens": 386531064.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.523097038269043, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8781800866127014, + "num_tokens": 386572372.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6639071702957153, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8718153834342957, + "num_tokens": 386607659.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6817785501480103, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8689916133880615, + "num_tokens": 386641322.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6941735744476318, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8807096481323242, + "num_tokens": 386675986.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5798852443695068, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8818408250808716, + "num_tokens": 386713651.0, + "step": 10132 + }, + { + "epoch": 1.2890217529576389, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7581549882888794, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8700456619262695, + "num_tokens": 386745817.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.65628182888031, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8765633702278137, + "num_tokens": 386776516.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5562164783477783, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8914968967437744, + "num_tokens": 386813100.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.714065670967102, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8728481531143188, + "num_tokens": 386852083.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6956069469451904, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8634577989578247, + "num_tokens": 386888257.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6847784519195557, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8631028532981873, + "num_tokens": 386920762.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.630478858947754, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8713158369064331, + "num_tokens": 386958392.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7940565347671509, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8693050742149353, + "num_tokens": 386991435.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6852248907089233, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8619460463523865, + "num_tokens": 387032043.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6637133359909058, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8657606840133667, + "num_tokens": 387073764.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5671991109848022, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8797692060470581, + "num_tokens": 387111563.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.680898904800415, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8734466433525085, + "num_tokens": 387146077.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7124813795089722, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8549765944480896, + "num_tokens": 387183320.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5692068338394165, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8602594137191772, + "num_tokens": 387226571.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.583145260810852, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.867222785949707, + "num_tokens": 387265959.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5833154916763306, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8703962564468384, + "num_tokens": 387307702.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6582618951797485, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.865431547164917, + "num_tokens": 387346089.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.563855767250061, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8734800815582275, + "num_tokens": 387388280.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6527000665664673, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8691885471343994, + "num_tokens": 387426945.0, + "step": 10151 + }, + { + "epoch": 1.2914387482508587, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.597597599029541, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8751339316368103, + "num_tokens": 387468216.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6084132194519043, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8651726245880127, + "num_tokens": 387512874.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5560606718063354, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8597465753555298, + "num_tokens": 387552489.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6946427822113037, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8733023405075073, + "num_tokens": 387585149.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5871332883834839, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8714240789413452, + "num_tokens": 387622371.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.636658787727356, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8808668851852417, + "num_tokens": 387658738.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6734507083892822, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8733525276184082, + "num_tokens": 387693146.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6697263717651367, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8610349893569946, + "num_tokens": 387732092.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5561065673828125, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.867647647857666, + "num_tokens": 387770833.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7677849531173706, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8594412207603455, + "num_tokens": 387805638.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.8979936838150024, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.847291886806488, + "num_tokens": 387837935.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5454933643341064, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8709080219268799, + "num_tokens": 387878763.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.499853491783142, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8600041270256042, + "num_tokens": 387923936.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5952223539352417, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8642883896827698, + "num_tokens": 387965675.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6245445013046265, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8577489256858826, + "num_tokens": 388004162.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5984259843826294, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8790875673294067, + "num_tokens": 388043223.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.658785104751587, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8595821857452393, + "num_tokens": 388080528.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5219725370407104, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8773024082183838, + "num_tokens": 388120112.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6272783279418945, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8675980567932129, + "num_tokens": 388158318.0, + "step": 10170 + }, + { + "epoch": 1.2938557435440783, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6245144605636597, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8650389909744263, + "num_tokens": 388196683.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7041041851043701, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8768398761749268, + "num_tokens": 388235156.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5650813579559326, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8705594539642334, + "num_tokens": 388274810.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.55315101146698, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8709701299667358, + "num_tokens": 388314496.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.4692693948745728, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8765418529510498, + "num_tokens": 388355047.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6758816242218018, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8675019145011902, + "num_tokens": 388392011.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6707255840301514, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8459321856498718, + "num_tokens": 388432549.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6735268831253052, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8737474083900452, + "num_tokens": 388468528.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6618422269821167, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8726649284362793, + "num_tokens": 388503722.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5828742980957031, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8725377917289734, + "num_tokens": 388544652.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.967952013015747, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8397181630134583, + "num_tokens": 388580444.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7026740312576294, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8675737380981445, + "num_tokens": 388620380.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6312271356582642, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8556565046310425, + "num_tokens": 388661440.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6067003011703491, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.865283727645874, + "num_tokens": 388701201.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5729656219482422, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8637007474899292, + "num_tokens": 388746168.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7127691507339478, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8771518468856812, + "num_tokens": 388782203.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6208428144454956, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8728760480880737, + "num_tokens": 388821613.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6636207103729248, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8748385906219482, + "num_tokens": 388860885.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.493254542350769, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8673766255378723, + "num_tokens": 388903267.0, + "step": 10189 + }, + { + "epoch": 1.296272738837298, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5460892915725708, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.872454047203064, + "num_tokens": 388941346.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6702889204025269, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8784106373786926, + "num_tokens": 388975219.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.8718777894973755, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8616091012954712, + "num_tokens": 389011984.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7491259574890137, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.870144248008728, + "num_tokens": 389051134.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6741117238998413, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8754564523696899, + "num_tokens": 389086140.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.728214144706726, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8695698380470276, + "num_tokens": 389119515.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7699291706085205, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8642171621322632, + "num_tokens": 389153713.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.644922137260437, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8538947105407715, + "num_tokens": 389193118.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7423352003097534, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.847281277179718, + "num_tokens": 389229911.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.642608404159546, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8701429963111877, + "num_tokens": 389267160.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.939224362373352, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.851952314376831, + "num_tokens": 389299024.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.609073519706726, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8792492151260376, + "num_tokens": 389337864.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.8178454637527466, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8592809438705444, + "num_tokens": 389374920.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6594655513763428, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8743135929107666, + "num_tokens": 389412843.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6789565086364746, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.884711742401123, + "num_tokens": 389444268.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6734375953674316, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8752610683441162, + "num_tokens": 389477798.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5950320959091187, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8581596612930298, + "num_tokens": 389518968.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.731674313545227, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8594610095024109, + "num_tokens": 389554823.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7368346452713013, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8683297634124756, + "num_tokens": 389588651.0, + "step": 10208 + }, + { + "epoch": 1.2986897341305177, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5832712650299072, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8650665283203125, + "num_tokens": 389629685.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7148629426956177, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8463333249092102, + "num_tokens": 389667334.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.59521484375, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8744471073150635, + "num_tokens": 389705871.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6409921646118164, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8817112445831299, + "num_tokens": 389740603.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5791975259780884, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8813486099243164, + "num_tokens": 389779875.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6219884157180786, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8601884841918945, + "num_tokens": 389827427.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7274408340454102, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8615732192993164, + "num_tokens": 389868860.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.4924060106277466, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8789792060852051, + "num_tokens": 389909639.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6163595914840698, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8810091018676758, + "num_tokens": 389944387.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.80039644241333, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8680710792541504, + "num_tokens": 389977363.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6335299015045166, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.868989884853363, + "num_tokens": 390015660.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6191716194152832, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8646771907806396, + "num_tokens": 390053682.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5024863481521606, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.877708911895752, + "num_tokens": 390093506.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5917284488677979, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8819729089736938, + "num_tokens": 390127688.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.460552453994751, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8776722550392151, + "num_tokens": 390170254.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.540511965751648, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8632169961929321, + "num_tokens": 390214746.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5548186302185059, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8818016052246094, + "num_tokens": 390249131.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6808282136917114, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8721188902854919, + "num_tokens": 390292266.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.542755365371704, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8681161999702454, + "num_tokens": 390335642.0, + "step": 10227 + }, + { + "epoch": 1.3011067294237375, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6756609678268433, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8750974535942078, + "num_tokens": 390370754.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5057809352874756, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8698719143867493, + "num_tokens": 390416434.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7879456281661987, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8647488355636597, + "num_tokens": 390450756.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5846047401428223, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8757807612419128, + "num_tokens": 390493961.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5491600036621094, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8733312487602234, + "num_tokens": 390539075.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.652047872543335, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8653843998908997, + "num_tokens": 390578851.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.613631248474121, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8690163493156433, + "num_tokens": 390615151.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6310194730758667, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8647671937942505, + "num_tokens": 390657848.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6588845252990723, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8782347440719604, + "num_tokens": 390694776.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7367726564407349, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8744335174560547, + "num_tokens": 390728859.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.553687334060669, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8761494159698486, + "num_tokens": 390769197.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5482264757156372, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8676907420158386, + "num_tokens": 390807847.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7601611614227295, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8476806282997131, + "num_tokens": 390843653.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5261346101760864, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8749219179153442, + "num_tokens": 390883853.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5065791606903076, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8719967603683472, + "num_tokens": 390925093.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.699988842010498, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8663917183876038, + "num_tokens": 390957908.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5922114849090576, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8632683157920837, + "num_tokens": 390995843.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.646423101425171, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8828600645065308, + "num_tokens": 391029282.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6523133516311646, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8670786023139954, + "num_tokens": 391067126.0, + "step": 10246 + }, + { + "epoch": 1.303523724716957, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6792314052581787, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8672704696655273, + "num_tokens": 391103761.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7981539964675903, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8750926852226257, + "num_tokens": 391144724.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6916528940200806, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8719302415847778, + "num_tokens": 391179544.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.633461833000183, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.882114589214325, + "num_tokens": 391215132.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.673021674156189, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8562924265861511, + "num_tokens": 391253161.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.9411427974700928, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8506500720977783, + "num_tokens": 391282523.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7542355060577393, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.876174807548523, + "num_tokens": 391314134.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6124902963638306, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8646271824836731, + "num_tokens": 391352301.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6962472200393677, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8681448698043823, + "num_tokens": 391390851.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5505648851394653, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8674237132072449, + "num_tokens": 391433278.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5950051546096802, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8674124479293823, + "num_tokens": 391473366.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5359572172164917, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8698804378509521, + "num_tokens": 391513419.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6513782739639282, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8631951808929443, + "num_tokens": 391553161.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6543117761611938, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8580558896064758, + "num_tokens": 391591167.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6358498334884644, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8688384294509888, + "num_tokens": 391631480.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6373622417449951, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8685901165008545, + "num_tokens": 391667670.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.66627836227417, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8676416873931885, + "num_tokens": 391706236.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.633538007736206, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8689019083976746, + "num_tokens": 391742916.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5944794416427612, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8715319633483887, + "num_tokens": 391779038.0, + "step": 10265 + }, + { + "epoch": 1.305940720010177, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5331957340240479, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8787320852279663, + "num_tokens": 391817937.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5265623331069946, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.867011308670044, + "num_tokens": 391858775.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5111972093582153, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8811554908752441, + "num_tokens": 391897344.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7136961221694946, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8682794570922852, + "num_tokens": 391931997.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6247200965881348, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8601168394088745, + "num_tokens": 391968299.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6084227561950684, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8525739908218384, + "num_tokens": 392005498.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6285837888717651, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8702314496040344, + "num_tokens": 392041103.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5878793001174927, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8669951558113098, + "num_tokens": 392076642.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.4741588830947876, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8685970306396484, + "num_tokens": 392120778.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6224197149276733, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.875562846660614, + "num_tokens": 392154382.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5731585025787354, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8691970705986023, + "num_tokens": 392191424.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.624635100364685, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8771276473999023, + "num_tokens": 392226030.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7226481437683105, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8743849396705627, + "num_tokens": 392266915.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5946801900863647, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.863289475440979, + "num_tokens": 392306656.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.523465871810913, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8836963176727295, + "num_tokens": 392349229.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.820562720298767, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8592458963394165, + "num_tokens": 392385052.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6392260789871216, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8509454131126404, + "num_tokens": 392425183.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5862758159637451, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8516899347305298, + "num_tokens": 392465766.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7866828441619873, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8527717590332031, + "num_tokens": 392496692.0, + "step": 10284 + }, + { + "epoch": 1.3083577153033965, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7450847625732422, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8701317310333252, + "num_tokens": 392529365.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.6665631532669067, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8517457246780396, + "num_tokens": 392569651.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5905390977859497, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8646736145019531, + "num_tokens": 392609991.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7397898435592651, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8693718910217285, + "num_tokens": 392644199.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.752193570137024, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8644527792930603, + "num_tokens": 392678508.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.564611554145813, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8594446182250977, + "num_tokens": 392724073.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6308749914169312, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8756779432296753, + "num_tokens": 392755274.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6595224142074585, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8645402193069458, + "num_tokens": 392793828.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.7589372396469116, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8567185997962952, + "num_tokens": 392827913.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.613982915878296, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8705291748046875, + "num_tokens": 392866045.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6113747358322144, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.873185396194458, + "num_tokens": 392902881.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5549136400222778, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8690940141677856, + "num_tokens": 392942948.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8536933660507202, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8467952013015747, + "num_tokens": 392977463.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6258918046951294, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8644496202468872, + "num_tokens": 393014686.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.64283287525177, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8638062477111816, + "num_tokens": 393052236.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7216745615005493, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8757096529006958, + "num_tokens": 393087367.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6342039108276367, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8613259792327881, + "num_tokens": 393130039.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6196469068527222, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8797537088394165, + "num_tokens": 393165736.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.55048668384552, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8729062080383301, + "num_tokens": 393207650.0, + "step": 10303 + }, + { + "epoch": 1.310774710596616, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5838249921798706, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8828395009040833, + "num_tokens": 393243874.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5207685232162476, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8715947270393372, + "num_tokens": 393286119.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6369760036468506, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8723852634429932, + "num_tokens": 393320213.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.684913158416748, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8662505149841309, + "num_tokens": 393358001.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5930030345916748, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8599538803100586, + "num_tokens": 393399878.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5026721954345703, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8758206963539124, + "num_tokens": 393444675.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4483158588409424, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8745039701461792, + "num_tokens": 393488883.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.758650302886963, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8631370067596436, + "num_tokens": 393527841.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6731846332550049, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8644423484802246, + "num_tokens": 393564160.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.69265878200531, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.859785795211792, + "num_tokens": 393604687.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6222468614578247, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8702768087387085, + "num_tokens": 393640529.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6215224266052246, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8617320656776428, + "num_tokens": 393680663.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.770921230316162, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.861302375793457, + "num_tokens": 393714830.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 3.679081439971924, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8980026245117188, + "num_tokens": 393754302.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5824064016342163, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8697341084480286, + "num_tokens": 393797066.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7454943656921387, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8730937838554382, + "num_tokens": 393836293.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5675418376922607, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8705297708511353, + "num_tokens": 393880457.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5278599262237549, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8713139295578003, + "num_tokens": 393923413.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.663598656654358, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8736346960067749, + "num_tokens": 393959222.0, + "step": 10322 + }, + { + "epoch": 1.313191705889836, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7152974605560303, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8680323362350464, + "num_tokens": 393997477.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6982710361480713, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8744054436683655, + "num_tokens": 394030807.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8115090131759644, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8706934452056885, + "num_tokens": 394062933.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5899276733398438, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.862402617931366, + "num_tokens": 394106469.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7959328889846802, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.854179859161377, + "num_tokens": 394144100.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6398859024047852, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8769431114196777, + "num_tokens": 394178733.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6431492567062378, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8668851256370544, + "num_tokens": 394220359.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5626320838928223, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8733912706375122, + "num_tokens": 394262860.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5639276504516602, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8529041409492493, + "num_tokens": 394307281.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5189299583435059, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8786778450012207, + "num_tokens": 394345874.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5413326025009155, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.870728075504303, + "num_tokens": 394388867.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5571539402008057, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8743358850479126, + "num_tokens": 394427436.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.62290358543396, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.872494637966156, + "num_tokens": 394464258.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6232136487960815, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8712560534477234, + "num_tokens": 394500266.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4565588235855103, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8654329776763916, + "num_tokens": 394548082.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5052663087844849, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8679392337799072, + "num_tokens": 394593570.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6332589387893677, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8681366443634033, + "num_tokens": 394634821.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.531076192855835, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8694338798522949, + "num_tokens": 394674600.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5297355651855469, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8729453086853027, + "num_tokens": 394712118.0, + "step": 10341 + }, + { + "epoch": 1.3156087011830557, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.63129460811615, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8649497032165527, + "num_tokens": 394750876.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5427050590515137, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8724265098571777, + "num_tokens": 394791149.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.614471197128296, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8779373168945312, + "num_tokens": 394827694.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 2.2172927856445312e-05, + "grad_norm": 1.5536879301071167, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8656867742538452, + "num_tokens": 394867598.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6662075519561768, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8827577829360962, + "num_tokens": 394899453.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.577713966369629, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8675249814987183, + "num_tokens": 394941343.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5718921422958374, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8644570708274841, + "num_tokens": 394981955.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6971410512924194, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8594425916671753, + "num_tokens": 395019012.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7907062768936157, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8618887066841125, + "num_tokens": 395051768.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6067657470703125, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8660399317741394, + "num_tokens": 395092475.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6841360330581665, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8657227754592896, + "num_tokens": 395128787.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6440143585205078, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8641718626022339, + "num_tokens": 395166002.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6702038049697876, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8592202663421631, + "num_tokens": 395202690.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5595529079437256, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8727253079414368, + "num_tokens": 395242196.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6596730947494507, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8652563095092773, + "num_tokens": 395278677.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5673727989196777, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8906808495521545, + "num_tokens": 395315229.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.636423110961914, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8740997314453125, + "num_tokens": 395355771.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.658923864364624, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8710041046142578, + "num_tokens": 395390842.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.736523151397705, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8836835622787476, + "num_tokens": 395425698.0, + "step": 10360 + }, + { + "epoch": 1.3180256964762753, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8000810146331787, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8578159809112549, + "num_tokens": 395462171.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6639058589935303, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8708765506744385, + "num_tokens": 395500232.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.632902979850769, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8657960891723633, + "num_tokens": 395539079.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6960148811340332, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.870660662651062, + "num_tokens": 395573273.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8282800912857056, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8653569221496582, + "num_tokens": 395605100.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5915517807006836, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8517091274261475, + "num_tokens": 395648374.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6149194240570068, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8611037135124207, + "num_tokens": 395691841.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6778743267059326, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8572486639022827, + "num_tokens": 395727895.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.637133002281189, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8679074048995972, + "num_tokens": 395766467.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6504415273666382, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8765556812286377, + "num_tokens": 395806163.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.578630805015564, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8693050146102905, + "num_tokens": 395844526.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6578259468078613, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8843646049499512, + "num_tokens": 395881291.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5458142757415771, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8694974780082703, + "num_tokens": 395926086.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5156632661819458, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8854942321777344, + "num_tokens": 395961184.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5009340047836304, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8847825527191162, + "num_tokens": 396001796.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5922393798828125, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8773015141487122, + "num_tokens": 396042618.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6762858629226685, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.85826575756073, + "num_tokens": 396081263.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6298444271087646, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8632695078849792, + "num_tokens": 396119325.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8405622243881226, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8714821338653564, + "num_tokens": 396148578.0, + "step": 10379 + }, + { + "epoch": 1.320442691769495, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6219689846038818, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8676807880401611, + "num_tokens": 396185851.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.502638816833496, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8705968856811523, + "num_tokens": 396229458.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6817506551742554, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.865465521812439, + "num_tokens": 396265007.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4655240774154663, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8870837688446045, + "num_tokens": 396304456.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5890618562698364, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8584339022636414, + "num_tokens": 396346521.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6088346242904663, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8663387298583984, + "num_tokens": 396383130.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.588670253753662, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8695824146270752, + "num_tokens": 396418858.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5752849578857422, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8611140251159668, + "num_tokens": 396458568.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5637271404266357, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8877854943275452, + "num_tokens": 396493868.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6889135837554932, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8552013635635376, + "num_tokens": 396530172.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6644948720932007, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8705118298530579, + "num_tokens": 396566946.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7354074716567993, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8493665456771851, + "num_tokens": 396601644.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7682145833969116, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8603556156158447, + "num_tokens": 396637538.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4982168674468994, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8787891864776611, + "num_tokens": 396680721.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4677684307098389, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8854943513870239, + "num_tokens": 396723769.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7385700941085815, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.855151355266571, + "num_tokens": 396757772.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5582669973373413, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8769979476928711, + "num_tokens": 396800095.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5572962760925293, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8639384508132935, + "num_tokens": 396841281.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6176729202270508, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8595261573791504, + "num_tokens": 396880980.0, + "step": 10398 + }, + { + "epoch": 1.3228596870627147, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5172070264816284, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.88255774974823, + "num_tokens": 396919773.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7397156953811646, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8678244352340698, + "num_tokens": 396957946.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5307120084762573, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8709544539451599, + "num_tokens": 396999937.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6463099718093872, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8555266261100769, + "num_tokens": 397036339.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.549963355064392, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8571036458015442, + "num_tokens": 397078874.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6283236742019653, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8751261234283447, + "num_tokens": 397114568.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5619319677352905, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8830574154853821, + "num_tokens": 397154978.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6378982067108154, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8662947416305542, + "num_tokens": 397193793.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.536534070968628, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8658086061477661, + "num_tokens": 397237469.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7398219108581543, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8414463996887207, + "num_tokens": 397275726.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.545642375946045, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8780770301818848, + "num_tokens": 397313902.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.717180609703064, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8711607456207275, + "num_tokens": 397354250.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5757323503494263, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8821093440055847, + "num_tokens": 397393173.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6001979112625122, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.886815071105957, + "num_tokens": 397426308.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5721197128295898, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.855360746383667, + "num_tokens": 397471548.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5750774145126343, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8686072826385498, + "num_tokens": 397515236.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6476024389266968, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.866945743560791, + "num_tokens": 397554129.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7049777507781982, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8532000780105591, + "num_tokens": 397591323.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5825722217559814, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8645036220550537, + "num_tokens": 397629280.0, + "step": 10417 + }, + { + "epoch": 1.3252766823559343, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6703063249588013, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8732268810272217, + "num_tokens": 397663251.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.735575795173645, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8731156587600708, + "num_tokens": 397696332.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5741868019104004, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8824935555458069, + "num_tokens": 397735948.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6692029237747192, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8606554269790649, + "num_tokens": 397771218.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6099172830581665, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8560950756072998, + "num_tokens": 397813210.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4872596263885498, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8821938037872314, + "num_tokens": 397853683.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.710603952407837, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8591102957725525, + "num_tokens": 397888060.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.625543236732483, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8555363416671753, + "num_tokens": 397926727.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5250600576400757, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8622257709503174, + "num_tokens": 397967877.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5932362079620361, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8766602873802185, + "num_tokens": 398011763.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5705426931381226, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8723107576370239, + "num_tokens": 398049800.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5835461616516113, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8457177877426147, + "num_tokens": 398095243.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6129018068313599, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8849165439605713, + "num_tokens": 398129761.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6162378787994385, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8800385594367981, + "num_tokens": 398166122.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6704192161560059, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8628527522087097, + "num_tokens": 398203465.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.718779444694519, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.866453230381012, + "num_tokens": 398234988.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5875259637832642, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8788602352142334, + "num_tokens": 398269594.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7377053499221802, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8716963529586792, + "num_tokens": 398305162.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.665700078010559, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8647657632827759, + "num_tokens": 398342151.0, + "step": 10436 + }, + { + "epoch": 1.3276936776491541, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.6023736000061035, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8623481392860413, + "num_tokens": 398382428.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5739468336105347, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8691117167472839, + "num_tokens": 398419289.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.697845458984375, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8657368421554565, + "num_tokens": 398461415.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5937780141830444, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8853915929794312, + "num_tokens": 398499620.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8549822568893433, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8665359020233154, + "num_tokens": 398529225.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5861141681671143, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8645880818367004, + "num_tokens": 398574242.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7060095071792603, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8528810143470764, + "num_tokens": 398614514.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.614925503730774, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8676832318305969, + "num_tokens": 398653087.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.532606840133667, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8899347186088562, + "num_tokens": 398690781.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.713165044784546, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.86841881275177, + "num_tokens": 398727229.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8115460872650146, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8701781034469604, + "num_tokens": 398760992.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.526499629020691, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8772702217102051, + "num_tokens": 398802452.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6193852424621582, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8722891211509705, + "num_tokens": 398838717.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.574588656425476, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8519394397735596, + "num_tokens": 398881919.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.7123080492019653, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8503260612487793, + "num_tokens": 398919066.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.6902670860290527, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8805967569351196, + "num_tokens": 398950324.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5381115674972534, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8779590725898743, + "num_tokens": 398991228.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5844560861587524, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8714747428894043, + "num_tokens": 399030413.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6222046613693237, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8707505464553833, + "num_tokens": 399067282.0, + "step": 10455 + }, + { + "epoch": 1.3301106729423737, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7340656518936157, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8637421131134033, + "num_tokens": 399104276.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6852165460586548, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8710463047027588, + "num_tokens": 399137809.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6705175638198853, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8495774269104004, + "num_tokens": 399176041.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5908252000808716, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8722890019416809, + "num_tokens": 399215305.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.8615723848342896, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8590233325958252, + "num_tokens": 399248550.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.642164945602417, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8599138855934143, + "num_tokens": 399284932.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.8512959480285645, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.88533616065979, + "num_tokens": 399314605.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6135048866271973, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8839569091796875, + "num_tokens": 399351794.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.637097716331482, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8570984601974487, + "num_tokens": 399389765.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.4889662265777588, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8772475719451904, + "num_tokens": 399429435.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5975303649902344, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8803460001945496, + "num_tokens": 399464056.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.4701814651489258, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8744623064994812, + "num_tokens": 399504015.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.6757431030273438, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8549435138702393, + "num_tokens": 399540849.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5594274997711182, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8737131357192993, + "num_tokens": 399580175.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6443358659744263, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8616626858711243, + "num_tokens": 399616160.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5553436279296875, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.868455171585083, + "num_tokens": 399660071.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6220701932907104, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8682460188865662, + "num_tokens": 399695593.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6992532014846802, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8828426003456116, + "num_tokens": 399730001.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5246098041534424, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.866698145866394, + "num_tokens": 399769123.0, + "step": 10474 + }, + { + "epoch": 1.3325276682355933, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.5515364408493042, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8431165814399719, + "num_tokens": 399815134.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.7147445678710938, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.872225821018219, + "num_tokens": 399849865.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.8719524145126343, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.861967921257019, + "num_tokens": 399883825.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.5820565223693848, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8666484355926514, + "num_tokens": 399921156.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.8454216718673706, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.867072343826294, + "num_tokens": 399963078.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 2.2530555725097656e-05, + "grad_norm": 1.541506052017212, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8828625679016113, + "num_tokens": 399999948.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.6974362134933472, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8504444360733032, + "num_tokens": 400039374.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.691824197769165, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8427750468254089, + "num_tokens": 400081608.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.8252801895141602, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8636268377304077, + "num_tokens": 400112925.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5239596366882324, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8760677576065063, + "num_tokens": 400151353.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.591339111328125, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.860763430595398, + "num_tokens": 400189391.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5314222574234009, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8753207325935364, + "num_tokens": 400229411.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 2.2411346435546875e-05, + "grad_norm": 1.648058295249939, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8514211773872375, + "num_tokens": 400266267.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5738180875778198, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8758369088172913, + "num_tokens": 400304672.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6828420162200928, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8657373189926147, + "num_tokens": 400337958.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.507802128791809, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8952062129974365, + "num_tokens": 400373450.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.66751229763031, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8685513734817505, + "num_tokens": 400407338.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.663386583328247, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8719077706336975, + "num_tokens": 400441766.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6797648668289185, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8622066974639893, + "num_tokens": 400478366.0, + "step": 10493 + }, + { + "epoch": 1.3349446635288131, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.601198673248291, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8812066316604614, + "num_tokens": 400517412.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.570664882659912, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.882055401802063, + "num_tokens": 400553105.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7187176942825317, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8715009689331055, + "num_tokens": 400588809.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5718953609466553, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8725250959396362, + "num_tokens": 400627881.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6847331523895264, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8771498799324036, + "num_tokens": 400662047.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5086663961410522, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8727182149887085, + "num_tokens": 400707975.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.604978322982788, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8742228746414185, + "num_tokens": 400746227.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5978912115097046, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8747876882553101, + "num_tokens": 400786144.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.578401803970337, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8752847909927368, + "num_tokens": 400828035.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7632803916931152, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.868832528591156, + "num_tokens": 400865428.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5198323726654053, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8669240474700928, + "num_tokens": 400907801.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6553943157196045, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.863909900188446, + "num_tokens": 400945745.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7790539264678955, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.872195839881897, + "num_tokens": 400980889.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5449943542480469, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8749104142189026, + "num_tokens": 401020476.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5977569818496704, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8720387816429138, + "num_tokens": 401058030.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6854561567306519, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8652256727218628, + "num_tokens": 401093083.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6332710981369019, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.867569088935852, + "num_tokens": 401129927.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6389374732971191, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8671151399612427, + "num_tokens": 401168984.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.718507170677185, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8673551082611084, + "num_tokens": 401204772.0, + "step": 10512 + }, + { + "epoch": 1.337361658822033, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.574150562286377, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8780820369720459, + "num_tokens": 401243160.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6521351337432861, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8748374581336975, + "num_tokens": 401282440.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.627109169960022, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8634347915649414, + "num_tokens": 401317755.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7312202453613281, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.847030758857727, + "num_tokens": 401358374.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.659564733505249, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8609785437583923, + "num_tokens": 401398544.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5877496004104614, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8760359287261963, + "num_tokens": 401433434.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5571221113204956, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8489329218864441, + "num_tokens": 401475484.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5851706266403198, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8669297695159912, + "num_tokens": 401513491.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6806819438934326, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8677358627319336, + "num_tokens": 401546394.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.4920556545257568, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8797571659088135, + "num_tokens": 401587815.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5975490808486938, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8803889751434326, + "num_tokens": 401631282.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.627475380897522, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8641523718833923, + "num_tokens": 401672180.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.693677306175232, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.864964485168457, + "num_tokens": 401704818.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5575264692306519, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8635671734809875, + "num_tokens": 401746956.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7082531452178955, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8682847023010254, + "num_tokens": 401782578.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6570196151733398, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8730354309082031, + "num_tokens": 401817362.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5711098909378052, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8659733533859253, + "num_tokens": 401858131.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7071785926818848, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8368057012557983, + "num_tokens": 401897255.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6416640281677246, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.864842414855957, + "num_tokens": 401938768.0, + "step": 10531 + }, + { + "epoch": 1.3397786541152525, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.549740195274353, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.877412736415863, + "num_tokens": 401979887.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.595300555229187, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8656734228134155, + "num_tokens": 402018526.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6329402923583984, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8594825267791748, + "num_tokens": 402056304.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.717702031135559, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8713356256484985, + "num_tokens": 402094487.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6528706550598145, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.881312906742096, + "num_tokens": 402128222.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.4806610345840454, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8821554780006409, + "num_tokens": 402169468.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.584714651107788, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8680649995803833, + "num_tokens": 402213736.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7756023406982422, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8582769632339478, + "num_tokens": 402246931.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.4846994876861572, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8816339373588562, + "num_tokens": 402290712.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5398967266082764, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8748548030853271, + "num_tokens": 402329933.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.9244120121002197, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8636878728866577, + "num_tokens": 402360216.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7500941753387451, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8769705295562744, + "num_tokens": 402391704.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.800095796585083, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8716082572937012, + "num_tokens": 402426456.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.655764102935791, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8846913576126099, + "num_tokens": 402462289.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7039114236831665, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8570252060890198, + "num_tokens": 402499670.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6442064046859741, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8747055530548096, + "num_tokens": 402537687.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7536911964416504, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8655081987380981, + "num_tokens": 402569222.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6990821361541748, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8807517290115356, + "num_tokens": 402607079.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.8155766725540161, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8715263605117798, + "num_tokens": 402641750.0, + "step": 10550 + }, + { + "epoch": 1.3421956494084721, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5489208698272705, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.864382803440094, + "num_tokens": 402684952.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5630972385406494, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8641766905784607, + "num_tokens": 402724961.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5775556564331055, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8702259659767151, + "num_tokens": 402763375.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.593774437904358, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8617011904716492, + "num_tokens": 402807287.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5067486763000488, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8795869946479797, + "num_tokens": 402849065.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6697683334350586, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8759884834289551, + "num_tokens": 402887120.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.8069345951080322, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8758693933486938, + "num_tokens": 402923653.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5576255321502686, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8666751980781555, + "num_tokens": 402964834.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5618096590042114, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8775231242179871, + "num_tokens": 403003206.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5856430530548096, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8631676435470581, + "num_tokens": 403044200.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6325689554214478, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8612321615219116, + "num_tokens": 403083580.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.595611572265625, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8603554964065552, + "num_tokens": 403127417.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6078602075576782, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8587955832481384, + "num_tokens": 403168563.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7639480829238892, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8646789193153381, + "num_tokens": 403201964.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6725140810012817, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8661186099052429, + "num_tokens": 403237786.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7434879541397095, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8515991568565369, + "num_tokens": 403272583.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6497937440872192, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8717228174209595, + "num_tokens": 403307056.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6916109323501587, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8814266920089722, + "num_tokens": 403343613.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5996981859207153, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8656526207923889, + "num_tokens": 403380449.0, + "step": 10569 + }, + { + "epoch": 1.344612644701692, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.548473596572876, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8666024804115295, + "num_tokens": 403421229.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6472934484481812, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8789534568786621, + "num_tokens": 403454951.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5729583501815796, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8798028826713562, + "num_tokens": 403492614.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6231807470321655, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8776837587356567, + "num_tokens": 403528974.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7001513242721558, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8743311166763306, + "num_tokens": 403566113.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5464173555374146, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8698511123657227, + "num_tokens": 403606414.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5749709606170654, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8812496662139893, + "num_tokens": 403644519.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5981450080871582, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8787986040115356, + "num_tokens": 403685608.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.556187629699707, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8753980994224548, + "num_tokens": 403726297.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6308445930480957, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8627720475196838, + "num_tokens": 403762814.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7647998332977295, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8841358423233032, + "num_tokens": 403794139.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5001939535140991, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8746124505996704, + "num_tokens": 403835533.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6806169748306274, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.867672860622406, + "num_tokens": 403869146.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5720784664154053, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8692715764045715, + "num_tokens": 403909617.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5511454343795776, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8801697492599487, + "num_tokens": 403949788.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.8532856702804565, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.874822199344635, + "num_tokens": 403984323.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5783872604370117, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8754562139511108, + "num_tokens": 404022748.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6275993585586548, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8628824949264526, + "num_tokens": 404060690.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.618232250213623, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8720691204071045, + "num_tokens": 404100093.0, + "step": 10588 + }, + { + "epoch": 1.3470296399949115, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6446359157562256, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.888966977596283, + "num_tokens": 404133175.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7647424936294556, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8572852611541748, + "num_tokens": 404167125.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5755008459091187, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8736192584037781, + "num_tokens": 404206347.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6785005331039429, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8727794289588928, + "num_tokens": 404240669.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6347579956054688, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8861222863197327, + "num_tokens": 404275936.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6797761917114258, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.870905876159668, + "num_tokens": 404312663.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7061243057250977, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8643420934677124, + "num_tokens": 404349041.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5845638513565063, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.876113772392273, + "num_tokens": 404386497.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5790250301361084, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8515751361846924, + "num_tokens": 404428110.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6048047542572021, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8867754936218262, + "num_tokens": 404460617.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6659940481185913, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8777875304222107, + "num_tokens": 404494170.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5918257236480713, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8809952735900879, + "num_tokens": 404530117.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.521589756011963, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8700866103172302, + "num_tokens": 404574145.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6213386058807373, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8777661919593811, + "num_tokens": 404611586.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5969403982162476, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8730130791664124, + "num_tokens": 404651894.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.578859567642212, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8825325965881348, + "num_tokens": 404692056.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6434110403060913, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8805578947067261, + "num_tokens": 404725506.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6156145334243774, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8731579184532166, + "num_tokens": 404763612.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 3.737996816635132, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8791068196296692, + "num_tokens": 404804346.0, + "step": 10607 + }, + { + "epoch": 1.3494466352881314, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.8692865371704102, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8589293956756592, + "num_tokens": 404836669.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6232563257217407, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8698342442512512, + "num_tokens": 404876392.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.4056107997894287, + "learning_rate": 1e-06, + "loss": 0.3208, + "mean_token_accuracy": 0.8960514068603516, + "num_tokens": 404919154.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6321120262145996, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.851525068283081, + "num_tokens": 404961867.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5021039247512817, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8688220977783203, + "num_tokens": 405000453.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.4742494821548462, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8698439002037048, + "num_tokens": 405039859.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.627246379852295, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8538713455200195, + "num_tokens": 405077849.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7597620487213135, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8579959273338318, + "num_tokens": 405115885.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5655301809310913, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8897658586502075, + "num_tokens": 405154374.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6740139722824097, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.876876175403595, + "num_tokens": 405187369.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.649404764175415, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8674296736717224, + "num_tokens": 405229211.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.5176118612289429, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8870052099227905, + "num_tokens": 405266356.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.7886689901351929, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8556474447250366, + "num_tokens": 405300996.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.625138521194458, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.873879611492157, + "num_tokens": 405339349.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6724095344543457, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.879570722579956, + "num_tokens": 405374090.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6568013429641724, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8696274757385254, + "num_tokens": 405412513.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.750264048576355, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8532776832580566, + "num_tokens": 405446284.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6304680109024048, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8721139430999756, + "num_tokens": 405484928.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6897940635681152, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8679372072219849, + "num_tokens": 405518765.0, + "step": 10626 + }, + { + "epoch": 1.351863630581351, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.6607589721679688, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.869433581829071, + "num_tokens": 405556174.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 2.2649765014648438e-05, + "grad_norm": 1.561166524887085, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8697496652603149, + "num_tokens": 405596643.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.581676721572876, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.865893542766571, + "num_tokens": 405639027.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7240688800811768, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8645570278167725, + "num_tokens": 405677444.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6633793115615845, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8744234442710876, + "num_tokens": 405716173.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6166150569915771, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.867499828338623, + "num_tokens": 405749937.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6035710573196411, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8692086338996887, + "num_tokens": 405789892.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5913859605789185, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8776347041130066, + "num_tokens": 405830401.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.4920083284378052, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.877753734588623, + "num_tokens": 405872877.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6478753089904785, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.871704638004303, + "num_tokens": 405909696.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6568995714187622, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.874144434928894, + "num_tokens": 405949315.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.633495569229126, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8685228228569031, + "num_tokens": 405990572.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5967652797698975, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8606008291244507, + "num_tokens": 406035658.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5153180360794067, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8716683387756348, + "num_tokens": 406081713.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6535555124282837, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8610857129096985, + "num_tokens": 406117084.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.8350409269332886, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8597784042358398, + "num_tokens": 406150265.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5687458515167236, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8881025314331055, + "num_tokens": 406189679.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5913448333740234, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8598929047584534, + "num_tokens": 406231657.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5778353214263916, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8736595511436462, + "num_tokens": 406270897.0, + "step": 10645 + }, + { + "epoch": 1.3542806258745705, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6103159189224243, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8787698745727539, + "num_tokens": 406310592.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.8327832221984863, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8561457395553589, + "num_tokens": 406344816.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6349241733551025, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.877327561378479, + "num_tokens": 406387086.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.86406672000885, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8633775115013123, + "num_tokens": 406421820.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6044390201568604, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8779518604278564, + "num_tokens": 406457981.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.4610438346862793, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8888072967529297, + "num_tokens": 406498733.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6701326370239258, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.870335578918457, + "num_tokens": 406538627.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5659736394882202, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8713687658309937, + "num_tokens": 406579778.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5798254013061523, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8777846097946167, + "num_tokens": 406618286.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6506413221359253, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8634358644485474, + "num_tokens": 406657615.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7438377141952515, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8629589676856995, + "num_tokens": 406692809.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.682255506515503, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8628072142601013, + "num_tokens": 406727792.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5339802503585815, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8769460320472717, + "num_tokens": 406766310.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5262349843978882, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8926792144775391, + "num_tokens": 406805011.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 3.778848648071289, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8599870204925537, + "num_tokens": 406841058.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.560578465461731, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8745092153549194, + "num_tokens": 406884125.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5503182411193848, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.874122142791748, + "num_tokens": 406929929.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5446971654891968, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8653302192687988, + "num_tokens": 406971114.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.612949013710022, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8622522354125977, + "num_tokens": 407006429.0, + "step": 10664 + }, + { + "epoch": 1.3566976211677904, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6033499240875244, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8713516592979431, + "num_tokens": 407048087.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.4570183753967285, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8710501194000244, + "num_tokens": 407090599.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5930702686309814, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8762985467910767, + "num_tokens": 407130671.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5481388568878174, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8800339698791504, + "num_tokens": 407167401.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6917859315872192, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8638915419578552, + "num_tokens": 407201041.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5764466524124146, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8821649551391602, + "num_tokens": 407238193.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5771957635879517, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8735995292663574, + "num_tokens": 407276166.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7203190326690674, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8800176382064819, + "num_tokens": 407310479.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6257457733154297, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8764920234680176, + "num_tokens": 407343865.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6739624738693237, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8535555601119995, + "num_tokens": 407378087.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5703781843185425, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8651592135429382, + "num_tokens": 407418558.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5432181358337402, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8725820779800415, + "num_tokens": 407460795.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5874264240264893, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8645682334899902, + "num_tokens": 407501957.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6714783906936646, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8572412133216858, + "num_tokens": 407540823.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6157211065292358, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8725529909133911, + "num_tokens": 407576216.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6266804933547974, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8641983270645142, + "num_tokens": 407616575.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.8152663707733154, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8497546911239624, + "num_tokens": 407649582.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7309788465499878, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8738651275634766, + "num_tokens": 407679842.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5813255310058594, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8777028918266296, + "num_tokens": 407718827.0, + "step": 10683 + }, + { + "epoch": 1.3591146164610102, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6076236963272095, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8683405518531799, + "num_tokens": 407761682.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6901519298553467, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8778320550918579, + "num_tokens": 407795897.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5521060228347778, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8791112899780273, + "num_tokens": 407834652.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5363991260528564, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8589717149734497, + "num_tokens": 407876338.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6782974004745483, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8626203536987305, + "num_tokens": 407912120.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6083221435546875, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8721458911895752, + "num_tokens": 407949649.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5517497062683105, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.881381094455719, + "num_tokens": 407986685.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5867376327514648, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8815853595733643, + "num_tokens": 408023512.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5708516836166382, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8755457401275635, + "num_tokens": 408064844.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.669039011001587, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8583475351333618, + "num_tokens": 408103216.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.729474425315857, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8705097436904907, + "num_tokens": 408137235.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7128084897994995, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8766453862190247, + "num_tokens": 408171308.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5097748041152954, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8683663606643677, + "num_tokens": 408215107.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6415715217590332, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.881019115447998, + "num_tokens": 408250855.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7400150299072266, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8753910064697266, + "num_tokens": 408287211.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6396774053573608, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.869473397731781, + "num_tokens": 408323126.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.8507169485092163, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8789721131324768, + "num_tokens": 408353389.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7021875381469727, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8652079105377197, + "num_tokens": 408388632.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.576302409172058, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8559131622314453, + "num_tokens": 408427359.0, + "step": 10702 + }, + { + "epoch": 1.3615316117542298, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.4776053428649902, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.871401309967041, + "num_tokens": 408471699.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7454636096954346, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8500549793243408, + "num_tokens": 408507320.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.701658844947815, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8767030239105225, + "num_tokens": 408542655.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6521557569503784, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8751171827316284, + "num_tokens": 408581561.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.629302740097046, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.887082576751709, + "num_tokens": 408616926.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5327070951461792, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8709447979927063, + "num_tokens": 408656611.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.803110957145691, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8602017164230347, + "num_tokens": 408688439.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5138736963272095, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8806978464126587, + "num_tokens": 408727271.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6681228876113892, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8632319569587708, + "num_tokens": 408765586.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6789087057113647, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8584655523300171, + "num_tokens": 408802368.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6337918043136597, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8766911029815674, + "num_tokens": 408836999.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5888824462890625, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8793023824691772, + "num_tokens": 408874331.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7360252141952515, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8751956224441528, + "num_tokens": 408904506.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6496241092681885, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8617156744003296, + "num_tokens": 408943649.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5281609296798706, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8658868670463562, + "num_tokens": 408984292.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5632610321044922, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8694676756858826, + "num_tokens": 409020758.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.612376093864441, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8526285886764526, + "num_tokens": 409061157.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7643338441848755, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8719288110733032, + "num_tokens": 409096888.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5133650302886963, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8741333484649658, + "num_tokens": 409135664.0, + "step": 10721 + }, + { + "epoch": 1.3639486070474494, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6202220916748047, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8762848377227783, + "num_tokens": 409170950.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5896855592727661, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.872483491897583, + "num_tokens": 409207090.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.728056788444519, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8657110929489136, + "num_tokens": 409243216.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7779810428619385, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.869053840637207, + "num_tokens": 409276252.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6761308908462524, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8599163293838501, + "num_tokens": 409315404.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6190662384033203, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8629332780838013, + "num_tokens": 409354307.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5560171604156494, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8819944858551025, + "num_tokens": 409391906.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5478936433792114, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8874604105949402, + "num_tokens": 409425511.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6058207750320435, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8703815937042236, + "num_tokens": 409464093.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5056602954864502, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8951572179794312, + "num_tokens": 409502744.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.565386414527893, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8624697923660278, + "num_tokens": 409547406.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5977225303649902, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8703985214233398, + "num_tokens": 409584487.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6501901149749756, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8659729957580566, + "num_tokens": 409620478.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6229256391525269, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8677212595939636, + "num_tokens": 409660645.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6148313283920288, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8722470998764038, + "num_tokens": 409696556.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.580674409866333, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8716989755630493, + "num_tokens": 409735288.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5014890432357788, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8889424800872803, + "num_tokens": 409776151.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4438965320587158, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.884575605392456, + "num_tokens": 409815032.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.442126750946045, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8712673187255859, + "num_tokens": 409858870.0, + "step": 10740 + }, + { + "epoch": 1.3663656023406692, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5189615488052368, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8668885231018066, + "num_tokens": 409899563.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.4429842233657837, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8742843270301819, + "num_tokens": 409943752.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.5276974439620972, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8679913878440857, + "num_tokens": 409984888.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6026116609573364, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.891438901424408, + "num_tokens": 410019635.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.6119587421417236, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8865071535110474, + "num_tokens": 410053570.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4463297128677368, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.884041428565979, + "num_tokens": 410092519.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5613268613815308, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8848361968994141, + "num_tokens": 410128729.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 2.276897430419922e-05, + "grad_norm": 1.7998950481414795, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8623451590538025, + "num_tokens": 410168601.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5236129760742188, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8707143068313599, + "num_tokens": 410210726.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.8291226625442505, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8540520668029785, + "num_tokens": 410244626.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.715088129043579, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.866615891456604, + "num_tokens": 410282594.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5741779804229736, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8798791170120239, + "num_tokens": 410319333.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.70004403591156, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8772592544555664, + "num_tokens": 410353189.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6200451850891113, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8731366395950317, + "num_tokens": 410390247.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5358270406723022, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8867080211639404, + "num_tokens": 410430445.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.670770287513733, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8671057224273682, + "num_tokens": 410466024.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5110267400741577, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8664813041687012, + "num_tokens": 410513917.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6710338592529297, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8769239783287048, + "num_tokens": 410546768.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6205506324768066, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8748686909675598, + "num_tokens": 410580593.0, + "step": 10759 + }, + { + "epoch": 1.3687825976338888, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6813271045684814, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8649800419807434, + "num_tokens": 410616522.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5531858205795288, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8605276346206665, + "num_tokens": 410659904.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5672708749771118, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8727311491966248, + "num_tokens": 410699270.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.625546932220459, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8746037483215332, + "num_tokens": 410735080.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6388905048370361, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8755133152008057, + "num_tokens": 410773931.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6712002754211426, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8669378757476807, + "num_tokens": 410809512.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7551566362380981, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8537662625312805, + "num_tokens": 410846453.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.655519723892212, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8623619675636292, + "num_tokens": 410883365.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5168488025665283, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8694981932640076, + "num_tokens": 410923201.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6056157350540161, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8746953010559082, + "num_tokens": 410962866.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.657547116279602, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8574836254119873, + "num_tokens": 411004647.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.654390573501587, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.874725341796875, + "num_tokens": 411041976.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6026902198791504, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.876788318157196, + "num_tokens": 411076683.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6091680526733398, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.868220329284668, + "num_tokens": 411112431.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7389308214187622, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.856531023979187, + "num_tokens": 411146533.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6676514148712158, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8613437414169312, + "num_tokens": 411183109.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.569061517715454, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8726387023925781, + "num_tokens": 411219468.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6086267232894897, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8668535351753235, + "num_tokens": 411259005.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.56101655960083, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.872002124786377, + "num_tokens": 411297145.0, + "step": 10778 + }, + { + "epoch": 1.3711995929271086, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5375016927719116, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8557196259498596, + "num_tokens": 411343468.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5667275190353394, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.871321439743042, + "num_tokens": 411382785.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5331189632415771, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8677691221237183, + "num_tokens": 411425750.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6321344375610352, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8784310221672058, + "num_tokens": 411461659.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6625593900680542, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8487703800201416, + "num_tokens": 411502147.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5857306718826294, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8746793270111084, + "num_tokens": 411539535.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7256686687469482, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8639804124832153, + "num_tokens": 411576439.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6368159055709839, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8846101760864258, + "num_tokens": 411613701.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.562029242515564, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8552846908569336, + "num_tokens": 411657372.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6547287702560425, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.883484959602356, + "num_tokens": 411697589.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7536518573760986, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8532543778419495, + "num_tokens": 411739257.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.635710597038269, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8723320960998535, + "num_tokens": 411774242.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6044471263885498, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8857904076576233, + "num_tokens": 411809360.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.69173002243042, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8544279336929321, + "num_tokens": 411848405.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7123057842254639, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8728581070899963, + "num_tokens": 411885232.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.812334656715393, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8601082563400269, + "num_tokens": 411922682.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7124234437942505, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8775703310966492, + "num_tokens": 411956658.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7080727815628052, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8675268292427063, + "num_tokens": 411996454.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5576316118240356, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8642143607139587, + "num_tokens": 412039076.0, + "step": 10797 + }, + { + "epoch": 1.3736165882203282, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4475641250610352, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8755381107330322, + "num_tokens": 412084999.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5472290515899658, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8673782348632812, + "num_tokens": 412127020.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6190625429153442, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8704556226730347, + "num_tokens": 412164551.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6950607299804688, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8872342109680176, + "num_tokens": 412198854.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4820635318756104, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8846027255058289, + "num_tokens": 412236209.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.594687819480896, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8638381361961365, + "num_tokens": 412275026.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.674089789390564, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8698337078094482, + "num_tokens": 412310989.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7290273904800415, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8597356677055359, + "num_tokens": 412345023.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7739814519882202, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8659327030181885, + "num_tokens": 412378578.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6839210987091064, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8709307909011841, + "num_tokens": 412417278.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5940769910812378, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8801454305648804, + "num_tokens": 412455586.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.687997579574585, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8586613535881042, + "num_tokens": 412493549.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.625271201133728, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.865707278251648, + "num_tokens": 412531176.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6937575340270996, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8446481823921204, + "num_tokens": 412575295.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6406749486923218, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8700000047683716, + "num_tokens": 412613329.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6017272472381592, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.881369411945343, + "num_tokens": 412646978.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6518635749816895, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8635932207107544, + "num_tokens": 412685210.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6565299034118652, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8775303363800049, + "num_tokens": 412720224.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.513229489326477, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8803489208221436, + "num_tokens": 412759932.0, + "step": 10816 + }, + { + "epoch": 1.3760335835135478, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7703922986984253, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8390110731124878, + "num_tokens": 412795619.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5291486978530884, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8841688632965088, + "num_tokens": 412833630.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5021787881851196, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8705216646194458, + "num_tokens": 412874563.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.488228678703308, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8812317848205566, + "num_tokens": 412914913.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5625698566436768, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8646467924118042, + "num_tokens": 412957185.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7424284219741821, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8697311282157898, + "num_tokens": 412994363.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6025097370147705, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8465080261230469, + "num_tokens": 413034288.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5822490453720093, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8789233565330505, + "num_tokens": 413076258.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6034754514694214, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8748540878295898, + "num_tokens": 413112852.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5782023668289185, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8718600273132324, + "num_tokens": 413155513.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5792856216430664, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8636766672134399, + "num_tokens": 413201962.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6990960836410522, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8791549801826477, + "num_tokens": 413240953.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5586491823196411, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.873505711555481, + "num_tokens": 413282103.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5786782503128052, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.864655613899231, + "num_tokens": 413321851.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6495379209518433, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8602191209793091, + "num_tokens": 413363058.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6682997941970825, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8667720556259155, + "num_tokens": 413405090.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6858892440795898, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8676685690879822, + "num_tokens": 413443639.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4988471269607544, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8731893301010132, + "num_tokens": 413486223.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.61053466796875, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8642269372940063, + "num_tokens": 413523769.0, + "step": 10835 + }, + { + "epoch": 1.3784505788067676, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.494460105895996, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.867620050907135, + "num_tokens": 413567672.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.718855619430542, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8693770170211792, + "num_tokens": 413599397.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5394213199615479, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8707272410392761, + "num_tokens": 413642279.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8962825536727905, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8663569688796997, + "num_tokens": 413674646.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7774536609649658, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8674847483634949, + "num_tokens": 413706605.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5845485925674438, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8679728507995605, + "num_tokens": 413744999.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5870670080184937, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8531128168106079, + "num_tokens": 413788857.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5343892574310303, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8755449652671814, + "num_tokens": 413831270.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6441235542297363, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8620203733444214, + "num_tokens": 413877229.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7449195384979248, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8755061626434326, + "num_tokens": 413914778.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5981745719909668, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8660656213760376, + "num_tokens": 413955376.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.554215431213379, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8725717663764954, + "num_tokens": 413997360.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5753774642944336, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8907253742218018, + "num_tokens": 414039079.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6528371572494507, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8659477233886719, + "num_tokens": 414074856.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5037862062454224, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8787412047386169, + "num_tokens": 414115366.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5286306142807007, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8722673058509827, + "num_tokens": 414156737.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5452840328216553, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.870002269744873, + "num_tokens": 414194103.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6356773376464844, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8753954768180847, + "num_tokens": 414231409.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6781996488571167, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8767461180686951, + "num_tokens": 414267918.0, + "step": 10854 + }, + { + "epoch": 1.3808675740999874, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5735708475112915, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8533980846405029, + "num_tokens": 414308571.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6520497798919678, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.880303680896759, + "num_tokens": 414341748.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 2.1385865211486816, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.876521110534668, + "num_tokens": 414383402.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6218780279159546, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8651158213615417, + "num_tokens": 414422753.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5366740226745605, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8761304616928101, + "num_tokens": 414461167.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.551918387413025, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.867666482925415, + "num_tokens": 414499564.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5402523279190063, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8653834462165833, + "num_tokens": 414540449.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.669992208480835, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8815770149230957, + "num_tokens": 414578993.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.638562798500061, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8585140705108643, + "num_tokens": 414619417.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5604212284088135, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8651344776153564, + "num_tokens": 414657356.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.509169340133667, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8873789310455322, + "num_tokens": 414697247.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4765268564224243, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.875539243221283, + "num_tokens": 414739291.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5475915670394897, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8676823973655701, + "num_tokens": 414778701.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 2.2267208099365234, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8641043901443481, + "num_tokens": 414819595.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6430248022079468, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8613429069519043, + "num_tokens": 414859726.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5232372283935547, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8724006414413452, + "num_tokens": 414903715.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.635823130607605, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8769943714141846, + "num_tokens": 414938229.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5901330709457397, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.863552451133728, + "num_tokens": 414974555.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.610162377357483, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8737912178039551, + "num_tokens": 415010087.0, + "step": 10873 + }, + { + "epoch": 1.383284569393207, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6912671327590942, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.873867392539978, + "num_tokens": 415040398.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5957531929016113, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8861243724822998, + "num_tokens": 415074771.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6116504669189453, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8760534524917603, + "num_tokens": 415114929.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7009297609329224, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8501383066177368, + "num_tokens": 415151966.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6060417890548706, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8699215650558472, + "num_tokens": 415192034.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6511032581329346, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8647990226745605, + "num_tokens": 415229342.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6104141473770142, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8787498474121094, + "num_tokens": 415267008.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5586440563201904, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8718037605285645, + "num_tokens": 415307450.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4950836896896362, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8728244304656982, + "num_tokens": 415348253.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.5457791090011597, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8828118443489075, + "num_tokens": 415385281.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4630855321884155, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.882235050201416, + "num_tokens": 415428564.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6392822265625, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8765388131141663, + "num_tokens": 415467067.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7644940614700317, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8804316520690918, + "num_tokens": 415501557.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.6256533861160278, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8581029176712036, + "num_tokens": 415541751.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.7000041007995605, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8844560384750366, + "num_tokens": 415578117.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5656408071517944, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8802100419998169, + "num_tokens": 415615534.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 2.288818359375e-05, + "grad_norm": 1.4805634021759033, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8809094429016113, + "num_tokens": 415657001.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6355781555175781, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8808031678199768, + "num_tokens": 415691903.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6595178842544556, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.861568808555603, + "num_tokens": 415730161.0, + "step": 10892 + }, + { + "epoch": 1.3857015646864266, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6307801008224487, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8759955167770386, + "num_tokens": 415765406.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8752906322479248, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8624393939971924, + "num_tokens": 415801447.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7698841094970703, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8497251868247986, + "num_tokens": 415841195.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6699044704437256, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8530100584030151, + "num_tokens": 415881263.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7082003355026245, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8637562990188599, + "num_tokens": 415920965.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.709466814994812, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8555216789245605, + "num_tokens": 415960607.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6395090818405151, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8573770523071289, + "num_tokens": 415997325.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.69003427028656, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8726161122322083, + "num_tokens": 416029312.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7641868591308594, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8780635595321655, + "num_tokens": 416070361.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6011407375335693, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8786554336547852, + "num_tokens": 416110341.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.843249797821045, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8909333944320679, + "num_tokens": 416141537.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.729408621788025, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8568448424339294, + "num_tokens": 416181144.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.659432053565979, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8618100881576538, + "num_tokens": 416223150.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6494038105010986, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8658506274223328, + "num_tokens": 416261871.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6046793460845947, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8731783032417297, + "num_tokens": 416298376.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6095081567764282, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8636529445648193, + "num_tokens": 416341096.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7392370700836182, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8735101222991943, + "num_tokens": 416375950.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7417867183685303, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8795461654663086, + "num_tokens": 416409255.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6582280397415161, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.873713493347168, + "num_tokens": 416445510.0, + "step": 10911 + }, + { + "epoch": 1.3881185599796464, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6474722623825073, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8616564273834229, + "num_tokens": 416483253.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4804019927978516, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.887002170085907, + "num_tokens": 416523520.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6023621559143066, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8630660772323608, + "num_tokens": 416561905.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5876004695892334, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8709034323692322, + "num_tokens": 416600236.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.699629783630371, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8710761070251465, + "num_tokens": 416633097.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7164121866226196, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8734052181243896, + "num_tokens": 416669850.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8246337175369263, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8642178773880005, + "num_tokens": 416706060.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5869895219802856, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8635027408599854, + "num_tokens": 416747710.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6530896425247192, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8657970428466797, + "num_tokens": 416783352.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5838873386383057, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.862749457359314, + "num_tokens": 416824908.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5076419115066528, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8781081438064575, + "num_tokens": 416867116.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5430437326431274, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8720346689224243, + "num_tokens": 416908352.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.741482138633728, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8422132730484009, + "num_tokens": 416947717.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7325196266174316, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8749381303787231, + "num_tokens": 416981063.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7732036113739014, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8692806959152222, + "num_tokens": 417017985.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6804237365722656, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8729933500289917, + "num_tokens": 417049879.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6743223667144775, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8680903315544128, + "num_tokens": 417087725.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6717844009399414, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8698551654815674, + "num_tokens": 417126152.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.711503505706787, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8609243631362915, + "num_tokens": 417165469.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.536041259765625, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8713938593864441, + "num_tokens": 417204454.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6631009578704834, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8445136547088623, + "num_tokens": 417245747.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6389552354812622, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8771113157272339, + "num_tokens": 417281969.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6655216217041016, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8684071898460388, + "num_tokens": 417320577.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.731621265411377, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.849756121635437, + "num_tokens": 417359088.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7069519758224487, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8893298506736755, + "num_tokens": 417393535.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5781725645065308, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8641821146011353, + "num_tokens": 417432610.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6942249536514282, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8605806827545166, + "num_tokens": 417475207.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.603010892868042, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8607134222984314, + "num_tokens": 417516515.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6875978708267212, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8881036639213562, + "num_tokens": 417554989.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8951539993286133, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.856670618057251, + "num_tokens": 417591137.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6501476764678955, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8628627061843872, + "num_tokens": 417626692.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.537665843963623, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.889682412147522, + "num_tokens": 417663424.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6379032135009766, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.865970253944397, + "num_tokens": 417700302.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5747244358062744, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8833245635032654, + "num_tokens": 417735221.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7099032402038574, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8723022937774658, + "num_tokens": 417769629.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5472288131713867, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8676329255104065, + "num_tokens": 417810071.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6022305488586426, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8815919160842896, + "num_tokens": 417849111.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 2.050863742828369, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8734501600265503, + "num_tokens": 417883795.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6624641418457031, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8852745294570923, + "num_tokens": 417918976.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7086201906204224, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8689692616462708, + "num_tokens": 417955591.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7012262344360352, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8540289998054504, + "num_tokens": 417995149.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5464690923690796, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8902907967567444, + "num_tokens": 418035954.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.631760597229004, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8679046630859375, + "num_tokens": 418072152.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.817409634590149, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8620148301124573, + "num_tokens": 418108165.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5929830074310303, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8727498650550842, + "num_tokens": 418148032.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5643463134765625, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8701115846633911, + "num_tokens": 418188708.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5543692111968994, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8669345378875732, + "num_tokens": 418232563.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7935513257980347, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8763989210128784, + "num_tokens": 418266588.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5573991537094116, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8633025288581848, + "num_tokens": 418307920.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6888983249664307, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8743504285812378, + "num_tokens": 418341122.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5361263751983643, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8646075129508972, + "num_tokens": 418380676.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.520986795425415, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8748787641525269, + "num_tokens": 418421587.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5018306970596313, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8755216598510742, + "num_tokens": 418464010.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6431052684783936, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8729637861251831, + "num_tokens": 418504448.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.3704193830490112, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8853840827941895, + "num_tokens": 418549344.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5275436639785767, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8772709965705872, + "num_tokens": 418588505.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6327344179153442, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.85283362865448, + "num_tokens": 418628671.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.624845027923584, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8816485404968262, + "num_tokens": 418665475.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6590839624404907, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8683434128761292, + "num_tokens": 418700159.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4890695810317993, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8660050630569458, + "num_tokens": 418744819.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5446399450302124, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8657163381576538, + "num_tokens": 418785992.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.511344075202942, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8759557008743286, + "num_tokens": 418826905.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.815299391746521, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.863350510597229, + "num_tokens": 418858726.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6269207000732422, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8700456023216248, + "num_tokens": 418893863.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8722162246704102, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8644132614135742, + "num_tokens": 418926545.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6588801145553589, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8802922964096069, + "num_tokens": 418961364.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6854511499404907, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8728070259094238, + "num_tokens": 418997131.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6223973035812378, + "learning_rate": 1e-06, + "loss": 0.3122, + "mean_token_accuracy": 0.8960273265838623, + "num_tokens": 419029215.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6641302108764648, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.884347677230835, + "num_tokens": 419068199.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7327375411987305, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8546113967895508, + "num_tokens": 419109529.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.436822772026062, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8854395151138306, + "num_tokens": 419146534.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6513786315917969, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8692300915718079, + "num_tokens": 419180680.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6534959077835083, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8703254461288452, + "num_tokens": 419222482.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4951421022415161, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8677399158477783, + "num_tokens": 419265165.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7091403007507324, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.862026035785675, + "num_tokens": 419305817.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7613834142684937, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8681331872940063, + "num_tokens": 419340228.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5142834186553955, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8938789367675781, + "num_tokens": 419376658.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5136306285858154, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8760912418365479, + "num_tokens": 419420421.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5928666591644287, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8759282827377319, + "num_tokens": 419461237.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6312212944030762, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8895002603530884, + "num_tokens": 419493188.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6123385429382324, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.855192244052887, + "num_tokens": 419535338.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7411847114562988, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8662786483764648, + "num_tokens": 419569284.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4379284381866455, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.888256847858429, + "num_tokens": 419614014.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.767623782157898, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8548606038093567, + "num_tokens": 419651833.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6706628799438477, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8615306615829468, + "num_tokens": 419687726.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7199554443359375, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.882239580154419, + "num_tokens": 419723262.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.605096459388733, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.864867091178894, + "num_tokens": 419760994.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4778475761413574, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8853034973144531, + "num_tokens": 419800701.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4674242734909058, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8888086080551147, + "num_tokens": 419841688.0, + "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6484096050262451, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.870635986328125, + "num_tokens": 419878022.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.571547269821167, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8592101335525513, + "num_tokens": 419915586.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.543664813041687, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8725937008857727, + "num_tokens": 419956218.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6521629095077515, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8667580485343933, + "num_tokens": 419992599.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5218652486801147, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8547548055648804, + "num_tokens": 420036605.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6352537870407104, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8696647882461548, + "num_tokens": 420073827.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.4912142753601074, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.870302140712738, + "num_tokens": 420119793.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6891714334487915, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8581424951553345, + "num_tokens": 420155933.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5543359518051147, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8715234398841858, + "num_tokens": 420196288.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.713811993598938, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8682676553726196, + "num_tokens": 420229297.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 16.83365249633789, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8606176972389221, + "num_tokens": 420265064.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8605281114578247, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8626405596733093, + "num_tokens": 420300786.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6707048416137695, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8744614720344543, + "num_tokens": 420335963.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5751737356185913, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.869140625, + "num_tokens": 420378396.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.596511960029602, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8670716285705566, + "num_tokens": 420423649.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8452187776565552, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.864126443862915, + "num_tokens": 420457706.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6341402530670166, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8709182143211365, + "num_tokens": 420493421.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6062140464782715, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8682494759559631, + "num_tokens": 420533510.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5460304021835327, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8723340630531311, + "num_tokens": 420574633.0, + "step": 11019 + }, + { + "epoch": 1.4018572700674214, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6766412258148193, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8673421144485474, + "num_tokens": 420612117.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5830074548721313, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.871772050857544, + "num_tokens": 420648645.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6142396926879883, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8747215867042542, + "num_tokens": 420687776.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.647719144821167, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8630173206329346, + "num_tokens": 420725266.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.637753963470459, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8688530921936035, + "num_tokens": 420761600.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5379000902175903, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8749151229858398, + "num_tokens": 420801748.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5344129800796509, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8773690462112427, + "num_tokens": 420841519.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7142701148986816, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8721997737884521, + "num_tokens": 420873431.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7706365585327148, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8656583428382874, + "num_tokens": 420904149.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6031510829925537, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8653564453125, + "num_tokens": 420945429.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6811636686325073, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8706634044647217, + "num_tokens": 420983312.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6040706634521484, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8659226894378662, + "num_tokens": 421024836.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6248160600662231, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.873401403427124, + "num_tokens": 421065979.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.617300033569336, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8740270733833313, + "num_tokens": 421104905.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6003201007843018, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8800575733184814, + "num_tokens": 421146238.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6969919204711914, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8699687719345093, + "num_tokens": 421186413.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5924229621887207, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8821222186088562, + "num_tokens": 421223181.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.8538928031921387, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8451423645019531, + "num_tokens": 421255822.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6145254373550415, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8639007806777954, + "num_tokens": 421293589.0, + "step": 11038 + }, + { + "epoch": 1.404274265360641, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.7121238708496094, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8677369356155396, + "num_tokens": 421330855.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6038706302642822, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8639910221099854, + "num_tokens": 421370071.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6646289825439453, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8697540760040283, + "num_tokens": 421406324.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6232479810714722, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.876427412033081, + "num_tokens": 421442762.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.5779792070388794, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8850263357162476, + "num_tokens": 421477963.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.9374336004257202, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8774837255477905, + "num_tokens": 421504868.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6563688516616821, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8816250562667847, + "num_tokens": 421542157.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.540296196937561, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8598901629447937, + "num_tokens": 421588259.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6105420589447021, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8705613017082214, + "num_tokens": 421626867.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6351836919784546, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.865862250328064, + "num_tokens": 421670001.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7072266340255737, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8777250051498413, + "num_tokens": 421703933.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 2.300739288330078e-05, + "grad_norm": 1.6340259313583374, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8698210716247559, + "num_tokens": 421741022.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.795870304107666, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8673461675643921, + "num_tokens": 421777932.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.658311128616333, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8807942271232605, + "num_tokens": 421817225.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.623019814491272, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8819818496704102, + "num_tokens": 421855226.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6136292219161987, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8746299743652344, + "num_tokens": 421892157.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6077214479446411, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8671992421150208, + "num_tokens": 421930940.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6018582582473755, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8555338978767395, + "num_tokens": 421973133.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.548797845840454, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8838451504707336, + "num_tokens": 422008897.0, + "step": 11057 + }, + { + "epoch": 1.4066912606538609, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6255367994308472, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8554939031600952, + "num_tokens": 422047280.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.639302372932434, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8735053539276123, + "num_tokens": 422082541.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6228820085525513, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8739696145057678, + "num_tokens": 422118703.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6456950902938843, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8747214078903198, + "num_tokens": 422157395.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5495569705963135, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8709237575531006, + "num_tokens": 422200255.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5666354894638062, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8715914487838745, + "num_tokens": 422239233.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.703872799873352, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.867000937461853, + "num_tokens": 422271379.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.601824164390564, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8830996155738831, + "num_tokens": 422305533.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6405129432678223, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8598741292953491, + "num_tokens": 422343469.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6847478151321411, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8599691390991211, + "num_tokens": 422381154.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.474812388420105, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8675833940505981, + "num_tokens": 422423816.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6219110488891602, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8646571636199951, + "num_tokens": 422460427.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5473787784576416, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8657424449920654, + "num_tokens": 422501517.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5959020853042603, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8621102571487427, + "num_tokens": 422542657.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6421947479248047, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8654202222824097, + "num_tokens": 422581574.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6525964736938477, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8666167259216309, + "num_tokens": 422616781.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6678831577301025, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8719537258148193, + "num_tokens": 422651556.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6740678548812866, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.875962495803833, + "num_tokens": 422687750.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7712435722351074, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8701770305633545, + "num_tokens": 422719801.0, + "step": 11076 + }, + { + "epoch": 1.4091082559470804, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6216232776641846, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8687838315963745, + "num_tokens": 422762459.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.575217843055725, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8615596294403076, + "num_tokens": 422802303.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5297784805297852, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8891743421554565, + "num_tokens": 422837655.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5356818437576294, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.853047788143158, + "num_tokens": 422881956.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.694108009338379, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.86981201171875, + "num_tokens": 422914684.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7432589530944824, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8610250949859619, + "num_tokens": 422952882.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 3.6495847702026367, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8808809518814087, + "num_tokens": 422997165.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.526851773262024, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8720732927322388, + "num_tokens": 423042295.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.584940791130066, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8812909126281738, + "num_tokens": 423080855.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6229819059371948, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8653286695480347, + "num_tokens": 423119977.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.4895596504211426, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8730263710021973, + "num_tokens": 423164557.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5252879858016968, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8712041974067688, + "num_tokens": 423205838.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6686192750930786, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.881537914276123, + "num_tokens": 423236192.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5022881031036377, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8649773597717285, + "num_tokens": 423276768.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5722026824951172, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.870794415473938, + "num_tokens": 423315232.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7538477182388306, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8708196878433228, + "num_tokens": 423349399.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.8139046430587769, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8704239130020142, + "num_tokens": 423380224.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5777101516723633, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8743195533752441, + "num_tokens": 423416514.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6175072193145752, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8729931712150574, + "num_tokens": 423454383.0, + "step": 11095 + }, + { + "epoch": 1.4115252512403003, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5815074443817139, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8744888305664062, + "num_tokens": 423491270.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.655826210975647, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8742561340332031, + "num_tokens": 423524656.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.55294930934906, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8830719590187073, + "num_tokens": 423564845.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.68332839012146, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8727242946624756, + "num_tokens": 423599590.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.613280177116394, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8786659240722656, + "num_tokens": 423634966.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.713005781173706, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8558720350265503, + "num_tokens": 423674808.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7645914554595947, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.870161771774292, + "num_tokens": 423710995.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5031406879425049, + "learning_rate": 1e-06, + "loss": 0.3218, + "mean_token_accuracy": 0.8937724828720093, + "num_tokens": 423751404.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6145473718643188, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.870418906211853, + "num_tokens": 423789964.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6148700714111328, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8791599273681641, + "num_tokens": 423826871.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.855440616607666, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8756461143493652, + "num_tokens": 423859591.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6847776174545288, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8714057207107544, + "num_tokens": 423897999.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7701534032821655, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8602607846260071, + "num_tokens": 423930799.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6544817686080933, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8652840852737427, + "num_tokens": 423972180.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6943066120147705, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8703374862670898, + "num_tokens": 424010341.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.701873779296875, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.864728569984436, + "num_tokens": 424044441.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7059026956558228, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.860118567943573, + "num_tokens": 424079710.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6870954036712646, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8655107021331787, + "num_tokens": 424116155.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6625463962554932, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8750452995300293, + "num_tokens": 424152952.0, + "step": 11114 + }, + { + "epoch": 1.4139422465335199, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6752643585205078, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8753788471221924, + "num_tokens": 424191144.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6042520999908447, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8674583435058594, + "num_tokens": 424226889.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5044234991073608, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8654978275299072, + "num_tokens": 424270123.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.4762552976608276, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8716081380844116, + "num_tokens": 424313593.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5517523288726807, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8841782212257385, + "num_tokens": 424351779.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7429068088531494, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8632316589355469, + "num_tokens": 424386050.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7561157941818237, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8621848225593567, + "num_tokens": 424418914.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6516978740692139, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8747657537460327, + "num_tokens": 424455009.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5347890853881836, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8730034232139587, + "num_tokens": 424498050.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7588491439819336, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8675978183746338, + "num_tokens": 424530993.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5965193510055542, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8702682852745056, + "num_tokens": 424570223.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5479477643966675, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8821056485176086, + "num_tokens": 424612526.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7249420881271362, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8596272468566895, + "num_tokens": 424651888.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5116961002349854, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8759996891021729, + "num_tokens": 424690885.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5864427089691162, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8919612765312195, + "num_tokens": 424724340.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6969207525253296, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8821362257003784, + "num_tokens": 424761327.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6749026775360107, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8725415468215942, + "num_tokens": 424798612.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.525497555732727, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8679943680763245, + "num_tokens": 424839280.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.568845510482788, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.861524224281311, + "num_tokens": 424877415.0, + "step": 11133 + }, + { + "epoch": 1.4163592418267397, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6179414987564087, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8704267740249634, + "num_tokens": 424911822.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.551568627357483, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8498141765594482, + "num_tokens": 424956144.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5889794826507568, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8668652176856995, + "num_tokens": 424998299.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.614614486694336, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8819869756698608, + "num_tokens": 425033052.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.672338604927063, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8921767473220825, + "num_tokens": 425069721.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4847615957260132, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8866675496101379, + "num_tokens": 425110708.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6667675971984863, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8635570406913757, + "num_tokens": 425149321.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.610226035118103, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8738448023796082, + "num_tokens": 425189084.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.638113260269165, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8692327737808228, + "num_tokens": 425233250.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6378278732299805, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8699958324432373, + "num_tokens": 425271762.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5473352670669556, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8803064227104187, + "num_tokens": 425310057.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5748459100723267, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8722540140151978, + "num_tokens": 425348327.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6092851161956787, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8686176538467407, + "num_tokens": 425385843.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7700704336166382, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8613409996032715, + "num_tokens": 425419454.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7493315935134888, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8684610724449158, + "num_tokens": 425454293.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5509299039840698, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8730483055114746, + "num_tokens": 425494206.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5970379114151, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8722643852233887, + "num_tokens": 425531757.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7619175910949707, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8623576164245605, + "num_tokens": 425569974.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6347172260284424, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.866244912147522, + "num_tokens": 425610319.0, + "step": 11152 + }, + { + "epoch": 1.4187762371199593, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.761874794960022, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8723870515823364, + "num_tokens": 425646109.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5706992149353027, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.865198016166687, + "num_tokens": 425689049.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.645714521408081, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8774356842041016, + "num_tokens": 425726059.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.749839425086975, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8740718364715576, + "num_tokens": 425759280.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.601872205734253, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8657544851303101, + "num_tokens": 425797932.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6145341396331787, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8753480911254883, + "num_tokens": 425832192.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6651928424835205, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8634586930274963, + "num_tokens": 425869922.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.593814492225647, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8649173974990845, + "num_tokens": 425905598.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.580836296081543, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8742709159851074, + "num_tokens": 425942852.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6101186275482178, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8716393709182739, + "num_tokens": 425982015.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.670971393585205, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.860414445400238, + "num_tokens": 426019900.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5911110639572144, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8582870960235596, + "num_tokens": 426060720.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5348776578903198, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8736037015914917, + "num_tokens": 426101077.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6300047636032104, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8621182441711426, + "num_tokens": 426139890.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5810730457305908, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8689110279083252, + "num_tokens": 426181838.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.569964051246643, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.862267255783081, + "num_tokens": 426223984.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6474436521530151, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8852319717407227, + "num_tokens": 426257622.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6259242296218872, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8677127361297607, + "num_tokens": 426292796.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6315927505493164, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.863981306552887, + "num_tokens": 426330101.0, + "step": 11171 + }, + { + "epoch": 1.421193232413179, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5219459533691406, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8797561526298523, + "num_tokens": 426372545.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4993438720703125, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8722420930862427, + "num_tokens": 426415643.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6192190647125244, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8735013008117676, + "num_tokens": 426453652.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.640199899673462, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8704559803009033, + "num_tokens": 426490911.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5503106117248535, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8748601078987122, + "num_tokens": 426531305.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.626024603843689, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8611553907394409, + "num_tokens": 426569808.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5969959497451782, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8796685934066772, + "num_tokens": 426604872.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.602916955947876, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8560999631881714, + "num_tokens": 426645048.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6280288696289062, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8609976768493652, + "num_tokens": 426685713.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8369919061660767, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8613926768302917, + "num_tokens": 426718659.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.69324791431427, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8615407943725586, + "num_tokens": 426758737.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6656055450439453, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8712406158447266, + "num_tokens": 426798011.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.597611904144287, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8606799840927124, + "num_tokens": 426840664.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.830068826675415, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8513036966323853, + "num_tokens": 426875658.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.589982032775879, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8780823945999146, + "num_tokens": 426912755.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5214351415634155, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8645831942558289, + "num_tokens": 426951874.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6071003675460815, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8803216218948364, + "num_tokens": 426986864.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6760783195495605, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8742131590843201, + "num_tokens": 427018967.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5904847383499146, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8811507225036621, + "num_tokens": 427056556.0, + "step": 11190 + }, + { + "epoch": 1.4236102277063987, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.401007056236267, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8749332427978516, + "num_tokens": 427101688.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5994737148284912, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8880690336227417, + "num_tokens": 427135328.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5858362913131714, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8686655759811401, + "num_tokens": 427175487.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.713596224784851, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8644805550575256, + "num_tokens": 427209591.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7184932231903076, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8738062381744385, + "num_tokens": 427242390.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5962544679641724, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8797063827514648, + "num_tokens": 427278242.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.58148992061615, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8775374889373779, + "num_tokens": 427316117.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5019416809082031, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8729689121246338, + "num_tokens": 427355945.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7143702507019043, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8574576377868652, + "num_tokens": 427390085.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6263517141342163, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8741701245307922, + "num_tokens": 427433815.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7148083448410034, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8616893291473389, + "num_tokens": 427477401.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7211898565292358, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.876665472984314, + "num_tokens": 427513486.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4981379508972168, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8797775506973267, + "num_tokens": 427556658.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.576607346534729, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8788858652114868, + "num_tokens": 427596997.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5097095966339111, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8677391409873962, + "num_tokens": 427640030.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5797885656356812, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.877583384513855, + "num_tokens": 427676932.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6748360395431519, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8831653594970703, + "num_tokens": 427708591.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.704548954963684, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.882374107837677, + "num_tokens": 427741275.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6258419752120972, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8622773885726929, + "num_tokens": 427778098.0, + "step": 11209 + }, + { + "epoch": 1.4260272229996183, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6478424072265625, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8679539561271667, + "num_tokens": 427814580.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6224838495254517, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8877019286155701, + "num_tokens": 427855253.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5806000232696533, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8711822032928467, + "num_tokens": 427891542.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5203379392623901, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8782016634941101, + "num_tokens": 427932693.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7599546909332275, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8713621497154236, + "num_tokens": 427970154.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.551180362701416, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8799407482147217, + "num_tokens": 428008750.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6228164434432983, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8614943027496338, + "num_tokens": 428046006.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6675434112548828, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8713992834091187, + "num_tokens": 428080719.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6457663774490356, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.862546443939209, + "num_tokens": 428117976.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6800892353057861, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8788456916809082, + "num_tokens": 428155165.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6031770706176758, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8690527081489563, + "num_tokens": 428192931.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.5197426080703735, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8867393732070923, + "num_tokens": 428229935.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.640654444694519, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8627822399139404, + "num_tokens": 428268575.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.7436004877090454, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8599665760993958, + "num_tokens": 428301990.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.588869333267212, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8812466263771057, + "num_tokens": 428343027.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6163647174835205, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8613802194595337, + "num_tokens": 428382420.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6825447082519531, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8835433721542358, + "num_tokens": 428411204.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6089810132980347, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8642215728759766, + "num_tokens": 428449755.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5900604724884033, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8787100315093994, + "num_tokens": 428486415.0, + "step": 11228 + }, + { + "epoch": 1.428444218292838, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.639681339263916, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8746733665466309, + "num_tokens": 428521852.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4767848253250122, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8779823184013367, + "num_tokens": 428566363.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6275670528411865, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8617101907730103, + "num_tokens": 428604265.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.535600185394287, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8772905468940735, + "num_tokens": 428643272.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.548641324043274, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8481256365776062, + "num_tokens": 428687209.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7316585779190063, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8710760474205017, + "num_tokens": 428722880.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.4718730449676514, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8777160048484802, + "num_tokens": 428766784.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6340134143829346, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8730471134185791, + "num_tokens": 428802856.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6411234140396118, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8746219873428345, + "num_tokens": 428839329.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 2.3126602172851562e-05, + "grad_norm": 1.6786929368972778, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8614051938056946, + "num_tokens": 428876080.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5890905857086182, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8596728444099426, + "num_tokens": 428915665.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.691230058670044, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8798375129699707, + "num_tokens": 428950117.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6275138854980469, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8782533407211304, + "num_tokens": 428992199.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.621585726737976, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8802503943443298, + "num_tokens": 429029649.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6143743991851807, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8544166684150696, + "num_tokens": 429070561.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6142641305923462, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8763213157653809, + "num_tokens": 429108579.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.637847661972046, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8637054562568665, + "num_tokens": 429146744.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6334651708602905, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.881820559501648, + "num_tokens": 429184144.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7210959196090698, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8817203044891357, + "num_tokens": 429215841.0, + "step": 11247 + }, + { + "epoch": 1.430861213586058, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6981542110443115, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8741838932037354, + "num_tokens": 429251842.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5428895950317383, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8676995038986206, + "num_tokens": 429293470.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5885446071624756, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8600826263427734, + "num_tokens": 429335070.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.671764850616455, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.874186635017395, + "num_tokens": 429374170.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5764615535736084, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8592549562454224, + "num_tokens": 429413368.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5128223896026611, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.875272810459137, + "num_tokens": 429455010.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6754285097122192, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8747820854187012, + "num_tokens": 429493346.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5556800365447998, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.872873067855835, + "num_tokens": 429536916.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5846819877624512, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8901577591896057, + "num_tokens": 429570822.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7107970714569092, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8676934242248535, + "num_tokens": 429607779.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5360997915267944, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8759616613388062, + "num_tokens": 429650335.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6945880651474, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8712024688720703, + "num_tokens": 429688137.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.615963101387024, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8658512234687805, + "num_tokens": 429728387.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6216870546340942, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8724297285079956, + "num_tokens": 429766763.0, + "step": 11261 + }, + { + "epoch": 1.4326421574863248, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6402325630187988, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8698214292526245, + "num_tokens": 429803381.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5250616073608398, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8750653266906738, + "num_tokens": 429848461.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6700358390808105, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8663905262947083, + "num_tokens": 429883849.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5869406461715698, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8703346252441406, + "num_tokens": 429922917.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5162274837493896, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8756592273712158, + "num_tokens": 429963878.0, + "step": 11266 + }, + { + "epoch": 1.4332782088792775, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6814929246902466, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8614029884338379, + "num_tokens": 429998051.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5359621047973633, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8840605020523071, + "num_tokens": 430037812.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.705238699913025, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8900519013404846, + "num_tokens": 430074385.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6048915386199951, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8726691007614136, + "num_tokens": 430114494.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4600142240524292, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8698620796203613, + "num_tokens": 430158912.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.692853331565857, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8654241561889648, + "num_tokens": 430198724.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5930253267288208, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8655019998550415, + "num_tokens": 430235226.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5030748844146729, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8700679540634155, + "num_tokens": 430278573.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.627325415611267, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8726868629455566, + "num_tokens": 430315987.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4849659204483032, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8642134666442871, + "num_tokens": 430361922.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4898983240127563, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8798630237579346, + "num_tokens": 430404798.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4462614059448242, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8751914501190186, + "num_tokens": 430448218.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.564404010772705, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8757352828979492, + "num_tokens": 430487070.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5973098278045654, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8583282232284546, + "num_tokens": 430529943.0, + "step": 11280 + }, + { + "epoch": 1.4350591527795447, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4644309282302856, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8839306235313416, + "num_tokens": 430572636.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7162609100341797, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8692796230316162, + "num_tokens": 430608297.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6008033752441406, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.874154806137085, + "num_tokens": 430643240.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8860652446746826, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8751674890518188, + "num_tokens": 430672298.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5914558172225952, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8726754784584045, + "num_tokens": 430712391.0, + "step": 11285 + }, + { + "epoch": 1.435695204172497, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7806646823883057, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8802645802497864, + "num_tokens": 430750699.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7413065433502197, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.85284823179245, + "num_tokens": 430789123.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6852525472640991, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8592275381088257, + "num_tokens": 430828453.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5166221857070923, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8705590963363647, + "num_tokens": 430870773.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7353795766830444, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8698205351829529, + "num_tokens": 430907183.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5977942943572998, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8769050240516663, + "num_tokens": 430943050.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5691536664962769, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8686967492103577, + "num_tokens": 430983018.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7668200731277466, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8592334985733032, + "num_tokens": 431019029.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5449506044387817, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8621312975883484, + "num_tokens": 431061060.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.787398099899292, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8760512471199036, + "num_tokens": 431090621.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.602687120437622, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8674368262290955, + "num_tokens": 431129654.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5164655447006226, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8874238729476929, + "num_tokens": 431165059.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6744709014892578, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.839547872543335, + "num_tokens": 431203356.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7308881282806396, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8886218667030334, + "num_tokens": 431246910.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.684108018875122, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8602588176727295, + "num_tokens": 431289224.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4702157974243164, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8787897825241089, + "num_tokens": 431331240.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6213295459747314, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8583090305328369, + "num_tokens": 431373239.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5072910785675049, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8762180209159851, + "num_tokens": 431415194.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.3976811170578003, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8799919486045837, + "num_tokens": 431461099.0, + "step": 11304 + }, + { + "epoch": 1.438112199465717, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.742257833480835, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8748615980148315, + "num_tokens": 431497380.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.701257586479187, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.871748685836792, + "num_tokens": 431534011.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6442146301269531, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8664734959602356, + "num_tokens": 431568797.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8556102514266968, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8792183995246887, + "num_tokens": 431596269.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.66199791431427, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8822418451309204, + "num_tokens": 431633173.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6001678705215454, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8766695857048035, + "num_tokens": 431668291.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5955034494400024, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8624718189239502, + "num_tokens": 431708707.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.674536108970642, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8745513558387756, + "num_tokens": 431742896.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.558493733406067, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8692929744720459, + "num_tokens": 431781375.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.832348108291626, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8713030815124512, + "num_tokens": 431818625.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7887589931488037, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8543046116828918, + "num_tokens": 431858704.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6272125244140625, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.873031735420227, + "num_tokens": 431897049.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6847597360610962, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8729134798049927, + "num_tokens": 431932920.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7215547561645508, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8685639500617981, + "num_tokens": 431969005.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.462683916091919, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8751591444015503, + "num_tokens": 432013106.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6167184114456177, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8716163635253906, + "num_tokens": 432051216.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6226376295089722, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8747437000274658, + "num_tokens": 432088660.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6090328693389893, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8600881695747375, + "num_tokens": 432132446.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7695714235305786, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8692379593849182, + "num_tokens": 432165750.0, + "step": 11323 + }, + { + "epoch": 1.4405291947589365, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6756454706192017, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.86594158411026, + "num_tokens": 432204658.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6144038438796997, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.87392657995224, + "num_tokens": 432245120.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5580532550811768, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8706046342849731, + "num_tokens": 432285415.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5607328414916992, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8796474933624268, + "num_tokens": 432323372.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5641030073165894, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8597497940063477, + "num_tokens": 432365109.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6549562215805054, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8606259226799011, + "num_tokens": 432403020.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4800316095352173, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.857267439365387, + "num_tokens": 432451630.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.594890832901001, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8576347827911377, + "num_tokens": 432494072.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5675137042999268, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8813998699188232, + "num_tokens": 432530248.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6079094409942627, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8504904508590698, + "num_tokens": 432572641.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5994378328323364, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8626624345779419, + "num_tokens": 432614975.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5484015941619873, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.878820538520813, + "num_tokens": 432654876.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.701012134552002, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8670082092285156, + "num_tokens": 432691951.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.587062120437622, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8604223728179932, + "num_tokens": 432732338.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.586700201034546, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8674231767654419, + "num_tokens": 432775461.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5910453796386719, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8511527180671692, + "num_tokens": 432817225.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6013801097869873, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8504298329353333, + "num_tokens": 432860463.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5927610397338867, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8729288578033447, + "num_tokens": 432896898.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7063068151474, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8654571771621704, + "num_tokens": 432932378.0, + "step": 11342 + }, + { + "epoch": 1.4429461900521563, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.640632152557373, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8599939346313477, + "num_tokens": 432968000.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.601238489151001, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8779458999633789, + "num_tokens": 433006746.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6688499450683594, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8696799278259277, + "num_tokens": 433043656.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6370024681091309, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8664435744285583, + "num_tokens": 433081932.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 3.7323620319366455, + "learning_rate": 1e-06, + "loss": 0.3214, + "mean_token_accuracy": 0.8939441442489624, + "num_tokens": 433115178.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8090720176696777, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8624801635742188, + "num_tokens": 433150025.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7125461101531982, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8772979974746704, + "num_tokens": 433184201.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.55043625831604, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8693876266479492, + "num_tokens": 433224071.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6256461143493652, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8632968664169312, + "num_tokens": 433262784.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6929957866668701, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8862462043762207, + "num_tokens": 433294562.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6013476848602295, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8713670372962952, + "num_tokens": 433332918.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6613848209381104, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8599637746810913, + "num_tokens": 433368951.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.525161862373352, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8815293312072754, + "num_tokens": 433406957.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6614104509353638, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8726186752319336, + "num_tokens": 433447244.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7100250720977783, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8520312309265137, + "num_tokens": 433487504.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5499236583709717, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8732579350471497, + "num_tokens": 433529751.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6745903491973877, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8662813305854797, + "num_tokens": 433566728.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.668239712715149, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8565032482147217, + "num_tokens": 433604911.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5107923746109009, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8825972080230713, + "num_tokens": 433644856.0, + "step": 11361 + }, + { + "epoch": 1.445363185345376, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4712365865707397, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8637505173683167, + "num_tokens": 433687630.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.750783920288086, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8638503551483154, + "num_tokens": 433726918.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5910145044326782, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8659237027168274, + "num_tokens": 433767332.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5789377689361572, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.886470377445221, + "num_tokens": 433801509.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7848007678985596, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8694683313369751, + "num_tokens": 433837288.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5570441484451294, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.878105640411377, + "num_tokens": 433875765.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4487723112106323, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8865038156509399, + "num_tokens": 433914354.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.572585105895996, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8599062561988831, + "num_tokens": 433955035.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6855069398880005, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8721691370010376, + "num_tokens": 433987818.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6422394514083862, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8754358291625977, + "num_tokens": 434026841.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5132488012313843, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8871123194694519, + "num_tokens": 434065614.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5286778211593628, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8730545043945312, + "num_tokens": 434106580.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6289715766906738, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8659132719039917, + "num_tokens": 434143609.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 3.6669561862945557, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8870171308517456, + "num_tokens": 434180668.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5635432004928589, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8923183679580688, + "num_tokens": 434216126.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5860871076583862, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8873265981674194, + "num_tokens": 434255114.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8101173639297485, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8489891290664673, + "num_tokens": 434290342.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5278459787368774, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8754286170005798, + "num_tokens": 434333470.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8894213438034058, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8705690503120422, + "num_tokens": 434364223.0, + "step": 11380 + }, + { + "epoch": 1.4477801806385955, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5589218139648438, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8779518604278564, + "num_tokens": 434401601.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5156251192092896, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8748940825462341, + "num_tokens": 434444160.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5713481903076172, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8765028715133667, + "num_tokens": 434482141.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5024062395095825, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.889115571975708, + "num_tokens": 434518817.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5488595962524414, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8683012127876282, + "num_tokens": 434561783.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6696656942367554, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8563586473464966, + "num_tokens": 434599730.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6354279518127441, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8598287105560303, + "num_tokens": 434640657.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.540907382965088, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8837097883224487, + "num_tokens": 434679851.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6938865184783936, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8685599565505981, + "num_tokens": 434716080.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4857110977172852, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.879332423210144, + "num_tokens": 434757965.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6247026920318604, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8665652275085449, + "num_tokens": 434796652.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5302448272705078, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8730831146240234, + "num_tokens": 434836689.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5609153509140015, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8748072981834412, + "num_tokens": 434875075.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6294268369674683, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.868414580821991, + "num_tokens": 434914311.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7013837099075317, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8593921661376953, + "num_tokens": 434954289.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6520155668258667, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8629894256591797, + "num_tokens": 434990657.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4858002662658691, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.886930525302887, + "num_tokens": 435030150.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6611506938934326, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.870344340801239, + "num_tokens": 435070050.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.648435115814209, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8692591190338135, + "num_tokens": 435110846.0, + "step": 11399 + }, + { + "epoch": 1.4501971759318153, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6963918209075928, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8591605424880981, + "num_tokens": 435149906.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5140376091003418, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8841474056243896, + "num_tokens": 435193082.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7862859964370728, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8759126663208008, + "num_tokens": 435223428.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7277413606643677, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8676184415817261, + "num_tokens": 435266426.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5930300951004028, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8656465411186218, + "num_tokens": 435307027.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.3485339879989624, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8862653374671936, + "num_tokens": 435349127.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6975865364074707, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8723149299621582, + "num_tokens": 435384523.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.643115520477295, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8650522232055664, + "num_tokens": 435423504.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7074519395828247, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8714406490325928, + "num_tokens": 435454798.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.661780834197998, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8879142999649048, + "num_tokens": 435490678.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.576971411705017, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.880469799041748, + "num_tokens": 435527010.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4927301406860352, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8741592168807983, + "num_tokens": 435568706.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6190392971038818, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8746048212051392, + "num_tokens": 435607029.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.479600191116333, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8806459307670593, + "num_tokens": 435646380.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5787149667739868, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8692213296890259, + "num_tokens": 435685701.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.698754072189331, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8694854974746704, + "num_tokens": 435723123.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7931241989135742, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8452529311180115, + "num_tokens": 435761013.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.8162972927093506, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8795937299728394, + "num_tokens": 435791471.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.484961748123169, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8763492107391357, + "num_tokens": 435837638.0, + "step": 11418 + }, + { + "epoch": 1.4526141712250351, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7504149675369263, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.856812059879303, + "num_tokens": 435871819.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6418617963790894, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8676730394363403, + "num_tokens": 435911317.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.740321397781372, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8445389270782471, + "num_tokens": 435952672.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6326922178268433, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8639005422592163, + "num_tokens": 435992608.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4602527618408203, + "learning_rate": 1e-06, + "loss": 0.3033, + "mean_token_accuracy": 0.9002982974052429, + "num_tokens": 436026936.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5272579193115234, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.877363383769989, + "num_tokens": 436065598.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4688957929611206, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.866999626159668, + "num_tokens": 436112669.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.507153034210205, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8704724311828613, + "num_tokens": 436153345.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.522627830505371, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8665276765823364, + "num_tokens": 436198814.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.660626769065857, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8862727880477905, + "num_tokens": 436233394.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4299813508987427, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8829319477081299, + "num_tokens": 436273483.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5096604824066162, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8765057325363159, + "num_tokens": 436313280.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.552079439163208, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8738219141960144, + "num_tokens": 436352952.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6215285062789917, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8743571043014526, + "num_tokens": 436388290.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.865762710571289, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8642423152923584, + "num_tokens": 436421223.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5574942827224731, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8675529360771179, + "num_tokens": 436462027.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7533795833587646, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8672442436218262, + "num_tokens": 436494430.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.642960786819458, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.850139856338501, + "num_tokens": 436536383.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.912524938583374, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8788906335830688, + "num_tokens": 436566643.0, + "step": 11437 + }, + { + "epoch": 1.4550311665182547, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6569440364837646, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8723753094673157, + "num_tokens": 436600469.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6095165014266968, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8780628442764282, + "num_tokens": 436635256.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5930707454681396, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8715351819992065, + "num_tokens": 436671026.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6401159763336182, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8662140965461731, + "num_tokens": 436710183.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7013357877731323, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8560522198677063, + "num_tokens": 436749792.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6188884973526, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8842272162437439, + "num_tokens": 436787831.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6699261665344238, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8678936958312988, + "num_tokens": 436822830.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6303956508636475, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8633933663368225, + "num_tokens": 436864097.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5695013999938965, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8610608577728271, + "num_tokens": 436904550.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6833001375198364, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8680313229560852, + "num_tokens": 436939799.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7211475372314453, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8707712888717651, + "num_tokens": 436974313.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6252057552337646, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8879072070121765, + "num_tokens": 437007000.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6591607332229614, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8802958726882935, + "num_tokens": 437045311.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5945203304290771, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8794029951095581, + "num_tokens": 437085897.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7361295223236084, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8776412010192871, + "num_tokens": 437117898.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5869303941726685, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8700209856033325, + "num_tokens": 437156358.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.749543309211731, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8609731197357178, + "num_tokens": 437188544.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6730382442474365, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8786208629608154, + "num_tokens": 437221470.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7156745195388794, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8798251152038574, + "num_tokens": 437257171.0, + "step": 11456 + }, + { + "epoch": 1.4574481618114743, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.737216830253601, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8672130107879639, + "num_tokens": 437290719.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5483556985855103, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8609638214111328, + "num_tokens": 437331979.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.659376859664917, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.861548900604248, + "num_tokens": 437373209.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.571367859840393, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8633391857147217, + "num_tokens": 437413787.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.658955693244934, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8718675374984741, + "num_tokens": 437448430.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5141010284423828, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8765550851821899, + "num_tokens": 437490800.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5630877017974854, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.860859751701355, + "num_tokens": 437531973.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7000508308410645, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8608757853507996, + "num_tokens": 437566864.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4508167505264282, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8856221437454224, + "num_tokens": 437609266.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5029466152191162, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.876796305179596, + "num_tokens": 437651232.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.643004059791565, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8683951497077942, + "num_tokens": 437691186.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.649390459060669, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8620902299880981, + "num_tokens": 437730964.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.520154595375061, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8745205998420715, + "num_tokens": 437769258.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5975152254104614, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8749147057533264, + "num_tokens": 437802358.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.533410906791687, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8735461831092834, + "num_tokens": 437843978.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.7170155048370361, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.865828275680542, + "num_tokens": 437877292.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5019651651382446, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8735679984092712, + "num_tokens": 437917776.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4625341892242432, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8857871294021606, + "num_tokens": 437959322.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.679002285003662, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8535497188568115, + "num_tokens": 437995080.0, + "step": 11475 + }, + { + "epoch": 1.4598651571046941, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.881939172744751, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8641070127487183, + "num_tokens": 438036309.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6770861148834229, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8768166303634644, + "num_tokens": 438070082.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.4428515434265137, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8755824565887451, + "num_tokens": 438113881.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6649165153503418, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8636650443077087, + "num_tokens": 438154564.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6604331731796265, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8701665997505188, + "num_tokens": 438194084.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6502704620361328, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8752223253250122, + "num_tokens": 438230465.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5003057718276978, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8883562684059143, + "num_tokens": 438268311.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6775994300842285, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.878549337387085, + "num_tokens": 438300198.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.71274995803833, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.875618577003479, + "num_tokens": 438331436.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.536880373954773, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8854192495346069, + "num_tokens": 438369018.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.700347661972046, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8672833442687988, + "num_tokens": 438405876.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5485659837722778, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8828842639923096, + "num_tokens": 438446565.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5218487977981567, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8816365003585815, + "num_tokens": 438488834.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5942598581314087, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8624322414398193, + "num_tokens": 438527571.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.6064037084579468, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8650888204574585, + "num_tokens": 438563811.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7147144079208374, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8529796600341797, + "num_tokens": 438597498.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5472017526626587, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8527204990386963, + "num_tokens": 438640534.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 2.3245811462402344e-05, + "grad_norm": 1.5581862926483154, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8578997254371643, + "num_tokens": 438682763.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6493523120880127, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8686479926109314, + "num_tokens": 438722034.0, + "step": 11494 + }, + { + "epoch": 1.4622821523979137, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5623587369918823, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8855834603309631, + "num_tokens": 438759280.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.608731746673584, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8488210439682007, + "num_tokens": 438799898.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5841120481491089, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8570813536643982, + "num_tokens": 438842534.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 3.322467088699341, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8423139452934265, + "num_tokens": 438873894.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.683993935585022, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8789588809013367, + "num_tokens": 438909215.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7204632759094238, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.861886739730835, + "num_tokens": 438944741.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5757789611816406, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8536274433135986, + "num_tokens": 438984097.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5536417961120605, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.875950276851654, + "num_tokens": 439021945.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5902605056762695, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8614761829376221, + "num_tokens": 439064641.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7018635272979736, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8824871182441711, + "num_tokens": 439103571.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7283381223678589, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8854004144668579, + "num_tokens": 439134995.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4468796253204346, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8787087202072144, + "num_tokens": 439177189.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5500872135162354, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8664146661758423, + "num_tokens": 439220797.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.559535264968872, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8503263592720032, + "num_tokens": 439265716.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5997158288955688, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8689419627189636, + "num_tokens": 439305495.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5254415273666382, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8913995027542114, + "num_tokens": 439344088.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6387203931808472, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8668680191040039, + "num_tokens": 439381024.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5709642171859741, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8752506971359253, + "num_tokens": 439418788.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6831310987472534, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8625016212463379, + "num_tokens": 439456742.0, + "step": 11513 + }, + { + "epoch": 1.4646991476911335, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.614848017692566, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8698903918266296, + "num_tokens": 439498078.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4750330448150635, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8860535621643066, + "num_tokens": 439537278.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6316598653793335, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.88022780418396, + "num_tokens": 439571323.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.597495198249817, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8714399337768555, + "num_tokens": 439610918.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5433956384658813, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8653446435928345, + "num_tokens": 439657402.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7970271110534668, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8818581104278564, + "num_tokens": 439691935.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6394320726394653, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8760244846343994, + "num_tokens": 439729606.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5377099514007568, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8573489189147949, + "num_tokens": 439771411.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5592589378356934, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8793843984603882, + "num_tokens": 439807887.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5634685754776, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8697723150253296, + "num_tokens": 439847825.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7839066982269287, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8728592395782471, + "num_tokens": 439885123.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7409292459487915, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8573787212371826, + "num_tokens": 439920899.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6794708967208862, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8833702802658081, + "num_tokens": 439951464.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5985466241836548, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8702260255813599, + "num_tokens": 439993230.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.682218074798584, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8776257038116455, + "num_tokens": 440032402.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5671862363815308, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8750530481338501, + "num_tokens": 440072692.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4685354232788086, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8754632472991943, + "num_tokens": 440116142.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4128996133804321, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8908305764198303, + "num_tokens": 440159570.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5933046340942383, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8675701022148132, + "num_tokens": 440198226.0, + "step": 11532 + }, + { + "epoch": 1.467116142984353, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4309844970703125, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8823410868644714, + "num_tokens": 440243338.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7198607921600342, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8548758029937744, + "num_tokens": 440282082.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6771833896636963, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8640109300613403, + "num_tokens": 440320384.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6619092226028442, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8486472368240356, + "num_tokens": 440360136.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.662566900253296, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8692061901092529, + "num_tokens": 440398164.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7084300518035889, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8485383987426758, + "num_tokens": 440432776.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6376041173934937, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8732431530952454, + "num_tokens": 440467710.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.570007562637329, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8713002800941467, + "num_tokens": 440508408.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4382621049880981, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8916950821876526, + "num_tokens": 440550281.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5910292863845825, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8630834221839905, + "num_tokens": 440590172.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5114089250564575, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8784807920455933, + "num_tokens": 440628417.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5455549955368042, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8813134431838989, + "num_tokens": 440666042.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6052104234695435, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8530677556991577, + "num_tokens": 440706855.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.53743314743042, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8810412883758545, + "num_tokens": 440747811.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5269228219985962, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8759865164756775, + "num_tokens": 440786631.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.9319944381713867, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8655046224594116, + "num_tokens": 440823065.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.735700011253357, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8611027598381042, + "num_tokens": 440857763.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8720968961715698, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8738527297973633, + "num_tokens": 440898479.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.9547468423843384, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8642656207084656, + "num_tokens": 440929573.0, + "step": 11551 + }, + { + "epoch": 1.4695331382775727, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6158943176269531, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8891733884811401, + "num_tokens": 440968014.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.765303611755371, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8547989726066589, + "num_tokens": 441005550.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6310820579528809, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8793780207633972, + "num_tokens": 441041148.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4503240585327148, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8732725977897644, + "num_tokens": 441087421.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4863728284835815, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8776684999465942, + "num_tokens": 441129667.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6385986804962158, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8784995079040527, + "num_tokens": 441163348.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6044445037841797, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8804621696472168, + "num_tokens": 441197981.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5740246772766113, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8768215775489807, + "num_tokens": 441235951.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.688772439956665, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8582886457443237, + "num_tokens": 441277307.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5964975357055664, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8602550029754639, + "num_tokens": 441316042.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.586240291595459, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8900063037872314, + "num_tokens": 441353809.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6230939626693726, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8719476461410522, + "num_tokens": 441392223.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.579179286956787, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8674451112747192, + "num_tokens": 441435160.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.548416018486023, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8677390813827515, + "num_tokens": 441474183.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.748781442642212, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8614469766616821, + "num_tokens": 441507547.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.733162760734558, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8751165866851807, + "num_tokens": 441540670.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6915013790130615, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8701149225234985, + "num_tokens": 441578534.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6660021543502808, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8784778118133545, + "num_tokens": 441613233.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6785848140716553, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.878467857837677, + "num_tokens": 441647157.0, + "step": 11570 + }, + { + "epoch": 1.4719501335707925, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.517487645149231, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8678797483444214, + "num_tokens": 441690563.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6824226379394531, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8753960132598877, + "num_tokens": 441728367.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8077824115753174, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8662621974945068, + "num_tokens": 441763068.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5758180618286133, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.864270031452179, + "num_tokens": 441805971.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6812621355056763, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.868388295173645, + "num_tokens": 441843453.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.715011715888977, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8628618717193604, + "num_tokens": 441877123.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7300664186477661, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8679163455963135, + "num_tokens": 441912460.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.434984564781189, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.880860447883606, + "num_tokens": 441954202.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4996085166931152, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8884948492050171, + "num_tokens": 441994553.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6212232112884521, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8648303747177124, + "num_tokens": 442033031.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8473329544067383, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.851362407207489, + "num_tokens": 442069496.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7612172365188599, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8643814325332642, + "num_tokens": 442104124.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6584914922714233, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8649248480796814, + "num_tokens": 442145250.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6834583282470703, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8594709634780884, + "num_tokens": 442180936.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.388802409172058, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8830190896987915, + "num_tokens": 442228454.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.66753089427948, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8775936365127563, + "num_tokens": 442263058.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5838124752044678, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8720421195030212, + "num_tokens": 442304803.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7084215879440308, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8705568313598633, + "num_tokens": 442338658.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.644156813621521, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8609077334403992, + "num_tokens": 442377388.0, + "step": 11589 + }, + { + "epoch": 1.4743671288640123, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8032604455947876, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8652514219284058, + "num_tokens": 442414138.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6091705560684204, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8805254101753235, + "num_tokens": 442447568.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4902106523513794, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8788979053497314, + "num_tokens": 442490840.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.473920226097107, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8694881200790405, + "num_tokens": 442534998.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7337690591812134, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8536060452461243, + "num_tokens": 442572838.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6721100807189941, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8623706102371216, + "num_tokens": 442606851.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6261506080627441, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8735131025314331, + "num_tokens": 442643519.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.691412091255188, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8832016587257385, + "num_tokens": 442674573.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.552775502204895, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8837190866470337, + "num_tokens": 442715179.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6037847995758057, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8640508055686951, + "num_tokens": 442752956.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6541123390197754, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8712419867515564, + "num_tokens": 442790288.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.656514286994934, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.860262393951416, + "num_tokens": 442827040.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6321415901184082, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8489859104156494, + "num_tokens": 442870597.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7359994649887085, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8665862679481506, + "num_tokens": 442908015.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5828430652618408, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8730018138885498, + "num_tokens": 442946680.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6112571954727173, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8674491047859192, + "num_tokens": 442986640.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6344558000564575, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8766001462936401, + "num_tokens": 443024552.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6072851419448853, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8792260885238647, + "num_tokens": 443063812.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7067440748214722, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8810276985168457, + "num_tokens": 443097586.0, + "step": 11608 + }, + { + "epoch": 1.476784124157232, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5155975818634033, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8600407838821411, + "num_tokens": 443143530.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6190077066421509, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.863379180431366, + "num_tokens": 443182279.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4524329900741577, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8731609582901001, + "num_tokens": 443225851.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.637113332748413, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8861559629440308, + "num_tokens": 443258781.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6865513324737549, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8752987384796143, + "num_tokens": 443290731.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6847114562988281, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8713455200195312, + "num_tokens": 443331835.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6228818893432617, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8764100074768066, + "num_tokens": 443370349.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.684280514717102, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8682248592376709, + "num_tokens": 443407515.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.829872965812683, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8845864534378052, + "num_tokens": 443445296.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6553747653961182, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8737340569496155, + "num_tokens": 443483615.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6485072374343872, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8650703430175781, + "num_tokens": 443519522.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.549811840057373, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8726235032081604, + "num_tokens": 443560950.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5058300495147705, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8770979642868042, + "num_tokens": 443600208.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.698973298072815, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8670654892921448, + "num_tokens": 443637655.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7354576587677002, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8750098347663879, + "num_tokens": 443667883.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.695149302482605, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8543087244033813, + "num_tokens": 443708370.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7083216905593872, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8783091306686401, + "num_tokens": 443742352.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7411751747131348, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8638860583305359, + "num_tokens": 443779096.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5847346782684326, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8840974569320679, + "num_tokens": 443815426.0, + "step": 11627 + }, + { + "epoch": 1.4792011194504515, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6200010776519775, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8702788352966309, + "num_tokens": 443854061.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5940115451812744, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8732904195785522, + "num_tokens": 443889148.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5753997564315796, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8712079524993896, + "num_tokens": 443926189.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5677348375320435, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8627179861068726, + "num_tokens": 443965064.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 6.882700443267822, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8679217100143433, + "num_tokens": 443998800.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6063934564590454, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8723161220550537, + "num_tokens": 444045461.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5780731439590454, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8839985728263855, + "num_tokens": 444085520.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.728535532951355, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8811543583869934, + "num_tokens": 444119018.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 2.147555351257324, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8635425567626953, + "num_tokens": 444164873.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6961615085601807, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.853503406047821, + "num_tokens": 444203570.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5006417036056519, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8502820730209351, + "num_tokens": 444250069.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.605858564376831, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8603354692459106, + "num_tokens": 444291396.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5666145086288452, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8721640110015869, + "num_tokens": 444327923.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5344098806381226, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8537628054618835, + "num_tokens": 444371637.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.702008605003357, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.877905547618866, + "num_tokens": 444407084.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6131374835968018, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8772752285003662, + "num_tokens": 444442286.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.679967999458313, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8556532859802246, + "num_tokens": 444477838.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7075676918029785, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8737830519676208, + "num_tokens": 444517393.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7333459854125977, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8709553480148315, + "num_tokens": 444554636.0, + "step": 11646 + }, + { + "epoch": 1.4816181147436713, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.638919472694397, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8676068186759949, + "num_tokens": 444592739.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6878814697265625, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8718103170394897, + "num_tokens": 444627879.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7154709100723267, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8662017583847046, + "num_tokens": 444660924.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5685144662857056, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8712258338928223, + "num_tokens": 444698846.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6664340496063232, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8818948864936829, + "num_tokens": 444731635.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 2.3114824295043945, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8652655482292175, + "num_tokens": 444766809.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6130268573760986, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8758971691131592, + "num_tokens": 444801972.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7110363245010376, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8699071407318115, + "num_tokens": 444837214.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7043622732162476, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8725717067718506, + "num_tokens": 444870847.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6342127323150635, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8822715282440186, + "num_tokens": 444907528.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6183451414108276, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8749701976776123, + "num_tokens": 444942394.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6556099653244019, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8653750419616699, + "num_tokens": 444975356.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5666497945785522, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.882170557975769, + "num_tokens": 445014059.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5301778316497803, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8783338665962219, + "num_tokens": 445052303.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6778010129928589, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8617383241653442, + "num_tokens": 445089099.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6707497835159302, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8601187467575073, + "num_tokens": 445124574.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.668412208557129, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8633521795272827, + "num_tokens": 445160897.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6411114931106567, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8683711290359497, + "num_tokens": 445195620.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.423378586769104, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8753273487091064, + "num_tokens": 445240999.0, + "step": 11665 + }, + { + "epoch": 1.484035110036891, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4152523279190063, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8865073919296265, + "num_tokens": 445285278.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.495471477508545, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8809996247291565, + "num_tokens": 445327966.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5669419765472412, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8733987212181091, + "num_tokens": 445365697.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6824086904525757, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8723760843276978, + "num_tokens": 445400751.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5815010070800781, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.866497278213501, + "num_tokens": 445441568.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5414295196533203, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8592740893363953, + "num_tokens": 445483941.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5928398370742798, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8687791228294373, + "num_tokens": 445523130.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.655643343925476, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8528724312782288, + "num_tokens": 445563622.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7127861976623535, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8599985241889954, + "num_tokens": 445600098.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6610898971557617, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8605027198791504, + "num_tokens": 445636772.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7077566385269165, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8533422350883484, + "num_tokens": 445672990.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7346059083938599, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8697109818458557, + "num_tokens": 445706440.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6325773000717163, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8727373480796814, + "num_tokens": 445743831.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6975210905075073, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8574912548065186, + "num_tokens": 445778682.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6637810468673706, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8722449541091919, + "num_tokens": 445815607.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7255603075027466, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8679333329200745, + "num_tokens": 445850598.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.560793399810791, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8817000985145569, + "num_tokens": 445896095.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5216718912124634, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8851208686828613, + "num_tokens": 445941344.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6878234148025513, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8774311542510986, + "num_tokens": 445975937.0, + "step": 11684 + }, + { + "epoch": 1.4864521053301107, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5023205280303955, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8871229290962219, + "num_tokens": 446016791.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5207405090332031, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.871227502822876, + "num_tokens": 446059248.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8062092065811157, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8539910912513733, + "num_tokens": 446100022.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.706192135810852, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8620703816413879, + "num_tokens": 446135920.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5393013954162598, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8782746195793152, + "num_tokens": 446173568.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4640998840332031, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8799241781234741, + "num_tokens": 446217708.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7332006692886353, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8602063655853271, + "num_tokens": 446252205.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7050209045410156, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8685417771339417, + "num_tokens": 446286312.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5662590265274048, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8777543306350708, + "num_tokens": 446324099.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4963397979736328, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8728457093238831, + "num_tokens": 446366277.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5186558961868286, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8769276142120361, + "num_tokens": 446406070.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7943309545516968, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8589670658111572, + "num_tokens": 446439181.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5540440082550049, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.865687370300293, + "num_tokens": 446475999.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.548092007637024, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8781828880310059, + "num_tokens": 446515254.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5660383701324463, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8718175888061523, + "num_tokens": 446552996.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4645367860794067, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8829689621925354, + "num_tokens": 446594707.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5672392845153809, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8665817975997925, + "num_tokens": 446633032.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5220340490341187, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8574172258377075, + "num_tokens": 446673475.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.742236614227295, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8785061836242676, + "num_tokens": 446705336.0, + "step": 11703 + }, + { + "epoch": 1.4888691006233303, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6365371942520142, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8692259192466736, + "num_tokens": 446741423.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6221848726272583, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8676120042800903, + "num_tokens": 446777159.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5808937549591064, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8591547608375549, + "num_tokens": 446820583.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5789111852645874, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8820853233337402, + "num_tokens": 446863216.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5934290885925293, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8743936419487, + "num_tokens": 446900224.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7043588161468506, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8749095797538757, + "num_tokens": 446933144.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6027063131332397, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8736511468887329, + "num_tokens": 446969732.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6061564683914185, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8651958703994751, + "num_tokens": 447011064.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7688032388687134, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8642086982727051, + "num_tokens": 447050877.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4569611549377441, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8807987570762634, + "num_tokens": 447089854.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5930845737457275, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8706986904144287, + "num_tokens": 447126290.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5882543325424194, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8705681562423706, + "num_tokens": 447161893.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6710264682769775, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8522077798843384, + "num_tokens": 447199005.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6624579429626465, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.87778240442276, + "num_tokens": 447232799.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.64445161819458, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8688028454780579, + "num_tokens": 447267177.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6824663877487183, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8456229567527771, + "num_tokens": 447308495.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5429472923278809, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.872948169708252, + "num_tokens": 447349042.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5150806903839111, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8676985502243042, + "num_tokens": 447392195.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5578203201293945, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8857837915420532, + "num_tokens": 447426791.0, + "step": 11722 + }, + { + "epoch": 1.49128609591655, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4923144578933716, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8776474595069885, + "num_tokens": 447466407.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5001696348190308, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8627592921257019, + "num_tokens": 447511431.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6144760847091675, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8744933605194092, + "num_tokens": 447549171.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4820114374160767, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8752196431159973, + "num_tokens": 447592731.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6042530536651611, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8701443672180176, + "num_tokens": 447633961.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6868606805801392, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8566352128982544, + "num_tokens": 447673697.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.685697078704834, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8657246232032776, + "num_tokens": 447709460.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5796210765838623, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8679304122924805, + "num_tokens": 447752975.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6982357501983643, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8520631790161133, + "num_tokens": 447794379.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7355436086654663, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8742179870605469, + "num_tokens": 447827034.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7119568586349487, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8438793420791626, + "num_tokens": 447865881.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.607452630996704, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8759241104125977, + "num_tokens": 447900988.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5767433643341064, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8649896383285522, + "num_tokens": 447940361.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5644149780273438, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8660781979560852, + "num_tokens": 447980511.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6420198678970337, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8724880218505859, + "num_tokens": 448021303.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.738931655883789, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8626543879508972, + "num_tokens": 448053579.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6651607751846313, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8675908446311951, + "num_tokens": 448089952.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5800331830978394, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.868529200553894, + "num_tokens": 448125790.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.9340876340866089, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8639628887176514, + "num_tokens": 448156232.0, + "step": 11741 + }, + { + "epoch": 1.4937030912097697, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6173171997070312, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8779290914535522, + "num_tokens": 448191223.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5510013103485107, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8763657212257385, + "num_tokens": 448231878.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7004072666168213, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8624504804611206, + "num_tokens": 448267323.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.572651982307434, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8688592314720154, + "num_tokens": 448303287.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5450849533081055, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8661301732063293, + "num_tokens": 448344835.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.596179723739624, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8799124360084534, + "num_tokens": 448382458.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.629558801651001, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.875074028968811, + "num_tokens": 448421155.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5833181142807007, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8689224720001221, + "num_tokens": 448457220.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.564117431640625, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8710181713104248, + "num_tokens": 448495923.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7001582384109497, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8908101916313171, + "num_tokens": 448527374.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.514180302619934, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.870765209197998, + "num_tokens": 448570751.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6180143356323242, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8487131595611572, + "num_tokens": 448610941.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6750224828720093, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8678048849105835, + "num_tokens": 448643615.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6327372789382935, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8582707643508911, + "num_tokens": 448681952.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6377052068710327, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8600096702575684, + "num_tokens": 448721867.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6181459426879883, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.865496814250946, + "num_tokens": 448760691.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5584747791290283, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.860741138458252, + "num_tokens": 448803300.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6617804765701294, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8683394193649292, + "num_tokens": 448844629.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6949495077133179, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8652552366256714, + "num_tokens": 448880710.0, + "step": 11760 + }, + { + "epoch": 1.4961200865029896, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8088195323944092, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8632509708404541, + "num_tokens": 448914939.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6144523620605469, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8645646572113037, + "num_tokens": 448955069.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.653207778930664, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8676555156707764, + "num_tokens": 448993350.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6439343690872192, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8659287691116333, + "num_tokens": 449031509.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6938279867172241, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8412762880325317, + "num_tokens": 449070226.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6177740097045898, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8737922310829163, + "num_tokens": 449109291.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8327409029006958, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8650402426719666, + "num_tokens": 449141386.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6978974342346191, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8804184198379517, + "num_tokens": 449172815.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5984755754470825, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560153841972351, + "num_tokens": 449210748.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.535658359527588, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8666421175003052, + "num_tokens": 449251250.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6064924001693726, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8692506551742554, + "num_tokens": 449285201.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6011021137237549, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8616988062858582, + "num_tokens": 449326948.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6804219484329224, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8583367466926575, + "num_tokens": 449369898.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6485296487808228, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8676825761795044, + "num_tokens": 449406623.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8102514743804932, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8566217422485352, + "num_tokens": 449441661.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.757253885269165, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8782819509506226, + "num_tokens": 449477821.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7509294748306274, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8842973113059998, + "num_tokens": 449511769.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6810400485992432, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8670421838760376, + "num_tokens": 449548340.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6536771059036255, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8616814017295837, + "num_tokens": 449584330.0, + "step": 11779 + }, + { + "epoch": 1.4985370817962091, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.528775691986084, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8724334239959717, + "num_tokens": 449626786.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.729024887084961, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.883244514465332, + "num_tokens": 449660702.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5938102006912231, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8622046709060669, + "num_tokens": 449699565.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6883668899536133, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8666213750839233, + "num_tokens": 449733224.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.589477777481079, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8769450187683105, + "num_tokens": 449767374.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6005326509475708, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8690270185470581, + "num_tokens": 449805035.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6402168273925781, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8697851896286011, + "num_tokens": 449843019.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.562695860862732, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8747470378875732, + "num_tokens": 449879848.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6553826332092285, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8656561970710754, + "num_tokens": 449921429.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.785335898399353, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8687443733215332, + "num_tokens": 449952513.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6378189325332642, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8819097280502319, + "num_tokens": 449987997.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7084263563156128, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8669542074203491, + "num_tokens": 450029626.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5175124406814575, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8755419850349426, + "num_tokens": 450072847.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6374759674072266, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8677982091903687, + "num_tokens": 450111560.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.480207085609436, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8740872740745544, + "num_tokens": 450154687.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.660020351409912, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8646004796028137, + "num_tokens": 450195885.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4472154378890991, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8810904026031494, + "num_tokens": 450236797.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.646605372428894, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8553770780563354, + "num_tokens": 450275949.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6634377241134644, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8812025785446167, + "num_tokens": 450310215.0, + "step": 11798 + }, + { + "epoch": 1.5009540770894287, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6268701553344727, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8413615822792053, + "num_tokens": 450354892.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.491511583328247, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8750919103622437, + "num_tokens": 450397818.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6104676723480225, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8750370740890503, + "num_tokens": 450432239.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5962034463882446, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8776490092277527, + "num_tokens": 450472606.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5857172012329102, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8569493293762207, + "num_tokens": 450513039.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5830435752868652, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8644381761550903, + "num_tokens": 450551207.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.615679144859314, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.876265287399292, + "num_tokens": 450586095.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.658815860748291, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8521397709846497, + "num_tokens": 450625500.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6218647956848145, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8766894936561584, + "num_tokens": 450664310.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.433887243270874, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.87453293800354, + "num_tokens": 450708504.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4732378721237183, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8803889751434326, + "num_tokens": 450750452.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6116135120391846, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8822530508041382, + "num_tokens": 450786052.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6929607391357422, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8645766973495483, + "num_tokens": 450822863.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.586507797241211, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8814685940742493, + "num_tokens": 450855441.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7734122276306152, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8569977879524231, + "num_tokens": 450889617.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6945654153823853, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8657658696174622, + "num_tokens": 450924216.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5947697162628174, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8780069351196289, + "num_tokens": 450961476.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.622714877128601, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8710730671882629, + "num_tokens": 451000915.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6856831312179565, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8682687282562256, + "num_tokens": 451041037.0, + "step": 11817 + }, + { + "epoch": 1.5033710723826486, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5262298583984375, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8610484600067139, + "num_tokens": 451086215.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4780241250991821, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8793057203292847, + "num_tokens": 451124457.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6947053670883179, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8655420541763306, + "num_tokens": 451164154.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6019032001495361, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8741472363471985, + "num_tokens": 451202033.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6975785493850708, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8691944479942322, + "num_tokens": 451237480.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5996335744857788, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8863518238067627, + "num_tokens": 451277089.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5227606296539307, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8648906946182251, + "num_tokens": 451318240.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6466294527053833, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8747638463973999, + "num_tokens": 451358847.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6914701461791992, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8599273562431335, + "num_tokens": 451398292.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5664533376693726, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8611587285995483, + "num_tokens": 451439508.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5563325881958008, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8753583431243896, + "num_tokens": 451477491.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.642411708831787, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8611651659011841, + "num_tokens": 451514213.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7143537998199463, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8658290505409241, + "num_tokens": 451548609.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.514925479888916, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8790266513824463, + "num_tokens": 451587698.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5446648597717285, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8699114322662354, + "num_tokens": 451628744.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6512361764907837, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8672361373901367, + "num_tokens": 451663737.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.612896203994751, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.870887279510498, + "num_tokens": 451698507.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7133350372314453, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8516039848327637, + "num_tokens": 451733531.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5654093027114868, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8754235506057739, + "num_tokens": 451770811.0, + "step": 11836 + }, + { + "epoch": 1.5057880676758684, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7942348718643188, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8600952625274658, + "num_tokens": 451800756.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.718583345413208, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8786613345146179, + "num_tokens": 451837995.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7062654495239258, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8495481610298157, + "num_tokens": 451878858.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6698729991912842, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8775898814201355, + "num_tokens": 451913316.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5942367315292358, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8776957988739014, + "num_tokens": 451950686.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7337857484817505, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8708362579345703, + "num_tokens": 451983341.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6401115655899048, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8625435829162598, + "num_tokens": 452024817.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6368683576583862, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8647059202194214, + "num_tokens": 452062670.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.510094404220581, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8726691007614136, + "num_tokens": 452105899.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.562863826751709, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.865593671798706, + "num_tokens": 452146375.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5383243560791016, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8731865882873535, + "num_tokens": 452187660.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5316563844680786, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8544002175331116, + "num_tokens": 452232480.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.536075234413147, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8722702860832214, + "num_tokens": 452273383.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5969483852386475, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8722795248031616, + "num_tokens": 452311853.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5501149892807007, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8713010549545288, + "num_tokens": 452353400.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.691627860069275, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8500317335128784, + "num_tokens": 452389454.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6839008331298828, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8574813604354858, + "num_tokens": 452424268.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5627304315567017, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8755653500556946, + "num_tokens": 452461475.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6556649208068848, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8588936924934387, + "num_tokens": 452500842.0, + "step": 11855 + }, + { + "epoch": 1.508205062969088, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5386451482772827, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8684459328651428, + "num_tokens": 452542436.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5680357217788696, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.868243396282196, + "num_tokens": 452582634.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7202136516571045, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8589539527893066, + "num_tokens": 452618940.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6199758052825928, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8706767559051514, + "num_tokens": 452657407.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7491633892059326, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8537003993988037, + "num_tokens": 452692472.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6198487281799316, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8808743357658386, + "num_tokens": 452725829.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6788214445114136, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8753030300140381, + "num_tokens": 452758315.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6038020849227905, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8716546297073364, + "num_tokens": 452796047.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.597939372062683, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8651188611984253, + "num_tokens": 452835516.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6669445037841797, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8842086791992188, + "num_tokens": 452869387.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5089547634124756, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8789714574813843, + "num_tokens": 452909289.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5031577348709106, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8855310082435608, + "num_tokens": 452946886.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5963214635849, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8562434911727905, + "num_tokens": 452986571.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.637547254562378, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8779953718185425, + "num_tokens": 453022267.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7915523052215576, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8748688697814941, + "num_tokens": 453051476.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6212464570999146, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8557626605033875, + "num_tokens": 453090898.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6059459447860718, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8645015954971313, + "num_tokens": 453134785.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7799992561340332, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.842792272567749, + "num_tokens": 453170311.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7253495454788208, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.872769832611084, + "num_tokens": 453209892.0, + "step": 11874 + }, + { + "epoch": 1.5106220582623076, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7137986421585083, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8718633651733398, + "num_tokens": 453244102.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6049063205718994, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8773849606513977, + "num_tokens": 453280299.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.641129493713379, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8787456750869751, + "num_tokens": 453317117.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5566740036010742, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8756916522979736, + "num_tokens": 453355346.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.561560034751892, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8838845491409302, + "num_tokens": 453393006.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6447007656097412, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8596655130386353, + "num_tokens": 453432835.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.754869818687439, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8544750213623047, + "num_tokens": 453466788.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5102803707122803, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8876538276672363, + "num_tokens": 453502708.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4750715494155884, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8757990002632141, + "num_tokens": 453543985.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5096477270126343, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8631128668785095, + "num_tokens": 453587534.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.526215672492981, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8656587600708008, + "num_tokens": 453630111.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6033190488815308, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8792870044708252, + "num_tokens": 453671303.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6807681322097778, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.877290666103363, + "num_tokens": 453707478.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5447279214859009, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8569952249526978, + "num_tokens": 453749681.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.595613956451416, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8768845796585083, + "num_tokens": 453788309.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.628851294517517, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8543585538864136, + "num_tokens": 453827832.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6530115604400635, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8557366132736206, + "num_tokens": 453871853.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7101500034332275, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8586715459823608, + "num_tokens": 453914087.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.849067211151123, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8643184900283813, + "num_tokens": 453946080.0, + "step": 11893 + }, + { + "epoch": 1.5130390535555271, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6775473356246948, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8883910179138184, + "num_tokens": 453979890.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4544199705123901, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8684694766998291, + "num_tokens": 454025184.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6591684818267822, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8670876622200012, + "num_tokens": 454066741.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.8241338729858398, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8641806840896606, + "num_tokens": 454101706.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.668461561203003, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.873542308807373, + "num_tokens": 454140385.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.610924482345581, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8713935613632202, + "num_tokens": 454177240.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6493849754333496, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8536872267723083, + "num_tokens": 454217075.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5429655313491821, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8718302249908447, + "num_tokens": 454255683.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4202924966812134, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8851133584976196, + "num_tokens": 454296405.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5719876289367676, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8684857487678528, + "num_tokens": 454333670.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5657095909118652, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8649212121963501, + "num_tokens": 454373181.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.528676986694336, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.873162031173706, + "num_tokens": 454414058.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.4757740497589111, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8689380288124084, + "num_tokens": 454456320.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.634102702140808, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8660454750061035, + "num_tokens": 454493660.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.627680778503418, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8831821084022522, + "num_tokens": 454528067.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6693373918533325, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8565576672554016, + "num_tokens": 454569220.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.591982364654541, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8522492051124573, + "num_tokens": 454612030.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7768290042877197, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8577574491500854, + "num_tokens": 454648657.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.670543909072876, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.867606520652771, + "num_tokens": 454686489.0, + "step": 11912 + }, + { + "epoch": 1.515456048848747, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.591873049736023, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8798683881759644, + "num_tokens": 454724187.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6333168745040894, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8699524402618408, + "num_tokens": 454763241.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.540352463722229, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8730916380882263, + "num_tokens": 454805227.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7060385942459106, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8572525978088379, + "num_tokens": 454841646.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6355262994766235, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8746813535690308, + "num_tokens": 454878285.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5549027919769287, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8835558295249939, + "num_tokens": 454919560.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.637191653251648, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8774884939193726, + "num_tokens": 454957730.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5365232229232788, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8678174018859863, + "num_tokens": 455000207.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5001426935195923, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8725292086601257, + "num_tokens": 455041909.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7184690237045288, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8788450956344604, + "num_tokens": 455072113.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5926496982574463, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8735281229019165, + "num_tokens": 455111640.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5449552536010742, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8759758472442627, + "num_tokens": 455154291.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.7837039232254028, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8685269951820374, + "num_tokens": 455184172.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.547685146331787, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.848960280418396, + "num_tokens": 455230919.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5093870162963867, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8714727163314819, + "num_tokens": 455275812.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5900347232818604, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8682948350906372, + "num_tokens": 455313369.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.5952057838439941, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8831099271774292, + "num_tokens": 455347879.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.712156057357788, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8714358806610107, + "num_tokens": 455385647.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6143122911453247, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8784482479095459, + "num_tokens": 455420617.0, + "step": 11931 + }, + { + "epoch": 1.5178730441419668, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5695346593856812, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8786311745643616, + "num_tokens": 455461327.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.656962513923645, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8904980421066284, + "num_tokens": 455494255.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5533050298690796, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8684325218200684, + "num_tokens": 455536216.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5969457626342773, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8698981404304504, + "num_tokens": 455577689.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6029545068740845, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8714141845703125, + "num_tokens": 455618587.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 2.3365020751953125e-05, + "grad_norm": 1.6742373704910278, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8452789783477783, + "num_tokens": 455659360.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8454657793045044, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8576204776763916, + "num_tokens": 455690942.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5666203498840332, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8787848949432373, + "num_tokens": 455735151.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6579501628875732, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8682464361190796, + "num_tokens": 455771980.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6458075046539307, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8583531379699707, + "num_tokens": 455810469.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.74065363407135, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8826565742492676, + "num_tokens": 455846148.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6516693830490112, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8749841451644897, + "num_tokens": 455882202.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.66237211227417, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8756896257400513, + "num_tokens": 455913351.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6988062858581543, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8549208641052246, + "num_tokens": 455947943.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5884932279586792, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8736727237701416, + "num_tokens": 455985490.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5231425762176514, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8796184659004211, + "num_tokens": 456024839.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.535979986190796, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8786855936050415, + "num_tokens": 456065918.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5221070051193237, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8583594560623169, + "num_tokens": 456110416.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.583618402481079, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8867200613021851, + "num_tokens": 456146999.0, + "step": 11950 + }, + { + "epoch": 1.5202900394351864, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6873619556427002, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8706129789352417, + "num_tokens": 456182565.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5771781206130981, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8623626828193665, + "num_tokens": 456225390.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8105477094650269, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8546534776687622, + "num_tokens": 456260289.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.697553277015686, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8606635332107544, + "num_tokens": 456303720.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4859074354171753, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8811157941818237, + "num_tokens": 456346542.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6257703304290771, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8758017420768738, + "num_tokens": 456386791.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.646061897277832, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8542851209640503, + "num_tokens": 456427135.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5449129343032837, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8647259473800659, + "num_tokens": 456470175.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8723055124282837, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8650107383728027, + "num_tokens": 456502023.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5398482084274292, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8807880282402039, + "num_tokens": 456540369.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6571632623672485, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8672927618026733, + "num_tokens": 456578675.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6559420824050903, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8838412165641785, + "num_tokens": 456611052.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6471613645553589, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.876368522644043, + "num_tokens": 456649038.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.72362220287323, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8787375092506409, + "num_tokens": 456687852.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6146049499511719, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.880142092704773, + "num_tokens": 456722681.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6422760486602783, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8493270874023438, + "num_tokens": 456767596.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4951837062835693, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.865260124206543, + "num_tokens": 456809352.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.611080288887024, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8634093403816223, + "num_tokens": 456848678.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.692101240158081, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8738664388656616, + "num_tokens": 456882168.0, + "step": 11969 + }, + { + "epoch": 1.522707034728406, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7085245847702026, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8591359853744507, + "num_tokens": 456915410.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5842589139938354, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8720957040786743, + "num_tokens": 456953588.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4900250434875488, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8599708080291748, + "num_tokens": 456997893.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.564772129058838, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8715192079544067, + "num_tokens": 457036439.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7117414474487305, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.866081178188324, + "num_tokens": 457069794.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.620086669921875, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8606835603713989, + "num_tokens": 457108851.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.537258267402649, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8744124174118042, + "num_tokens": 457149014.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8779617547988892, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8468474745750427, + "num_tokens": 457182503.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7787920236587524, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.864319384098053, + "num_tokens": 457216627.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7120258808135986, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8722852468490601, + "num_tokens": 457252646.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.588767170906067, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8739018440246582, + "num_tokens": 457292895.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6380945444107056, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8383395671844482, + "num_tokens": 457337923.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5488768815994263, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8833997249603271, + "num_tokens": 457376112.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5707173347473145, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.860193133354187, + "num_tokens": 457415458.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7417045831680298, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8734606504440308, + "num_tokens": 457451987.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.544146180152893, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.861356794834137, + "num_tokens": 457496883.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.652500033378601, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8587121367454529, + "num_tokens": 457534712.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5609418153762817, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8732159733772278, + "num_tokens": 457576995.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 4.693609714508057, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8641873598098755, + "num_tokens": 457608736.0, + "step": 11988 + }, + { + "epoch": 1.5251240300216258, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.581007957458496, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8748817443847656, + "num_tokens": 457650515.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.556046485900879, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8836238384246826, + "num_tokens": 457687133.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.62395179271698, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8635321855545044, + "num_tokens": 457724487.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5797162055969238, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8783384561538696, + "num_tokens": 457761781.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6934576034545898, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8757162094116211, + "num_tokens": 457795312.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.620274543762207, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8511946797370911, + "num_tokens": 457838466.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.609175682067871, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.880387544631958, + "num_tokens": 457877515.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7349799871444702, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.865850567817688, + "num_tokens": 457912949.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6132302284240723, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8734457492828369, + "num_tokens": 457953409.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7154656648635864, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8569245338439941, + "num_tokens": 457990866.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5132818222045898, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8687063455581665, + "num_tokens": 458029887.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6126140356063843, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8744561672210693, + "num_tokens": 458070985.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.744969367980957, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8636214137077332, + "num_tokens": 458109267.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.71834397315979, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8585103154182434, + "num_tokens": 458148809.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5876015424728394, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8677350282669067, + "num_tokens": 458184899.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5366371870040894, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8701352477073669, + "num_tokens": 458223965.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6780321598052979, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8559224009513855, + "num_tokens": 458258099.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4855958223342896, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8815149068832397, + "num_tokens": 458299649.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6480200290679932, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8735244274139404, + "num_tokens": 458340402.0, + "step": 12007 + }, + { + "epoch": 1.5275410253148456, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8545558452606201, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8753484487533569, + "num_tokens": 458377580.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6223210096359253, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8761886954307556, + "num_tokens": 458417321.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6108325719833374, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8727952241897583, + "num_tokens": 458453834.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.745856523513794, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8694136142730713, + "num_tokens": 458487506.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6688648462295532, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8568744659423828, + "num_tokens": 458531871.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7780448198318481, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8617943525314331, + "num_tokens": 458567756.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6038285493850708, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8709564208984375, + "num_tokens": 458609019.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7517707347869873, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8761333227157593, + "num_tokens": 458646360.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.70726478099823, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8658768534660339, + "num_tokens": 458682043.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5442273616790771, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.87211012840271, + "num_tokens": 458721960.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4898433685302734, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8793953657150269, + "num_tokens": 458759070.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.4889445304870605, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8847395181655884, + "num_tokens": 458799371.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7129909992218018, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8737907409667969, + "num_tokens": 458832557.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6363273859024048, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8739978075027466, + "num_tokens": 458871660.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7411589622497559, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8501979112625122, + "num_tokens": 458906546.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8063340187072754, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8619825839996338, + "num_tokens": 458939776.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6464215517044067, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8781204223632812, + "num_tokens": 458974654.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.601596713066101, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8695298433303833, + "num_tokens": 459014454.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5411490201950073, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8670774698257446, + "num_tokens": 459056500.0, + "step": 12026 + }, + { + "epoch": 1.5299580206080652, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5150068998336792, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8794173002243042, + "num_tokens": 459097090.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.523185133934021, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8591580390930176, + "num_tokens": 459137721.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6169285774230957, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8772938251495361, + "num_tokens": 459175325.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6522098779678345, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8528822660446167, + "num_tokens": 459215656.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5670174360275269, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8852506279945374, + "num_tokens": 459253995.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.666609287261963, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8698422312736511, + "num_tokens": 459288033.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6674730777740479, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8755128383636475, + "num_tokens": 459322695.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8246233463287354, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8723617196083069, + "num_tokens": 459354273.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5531234741210938, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8813984990119934, + "num_tokens": 459394264.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5882563591003418, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8639033436775208, + "num_tokens": 459435012.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.664735198020935, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8807095289230347, + "num_tokens": 459468341.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.833666205406189, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8645361065864563, + "num_tokens": 459501843.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6217765808105469, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8793935775756836, + "num_tokens": 459537381.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5338199138641357, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8601118326187134, + "num_tokens": 459580300.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5968999862670898, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8639899492263794, + "num_tokens": 459617928.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6684436798095703, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8552795052528381, + "num_tokens": 459657579.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6548705101013184, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8652396202087402, + "num_tokens": 459695631.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7032650709152222, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.871711015701294, + "num_tokens": 459731355.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7040451765060425, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.881171464920044, + "num_tokens": 459762811.0, + "step": 12045 + }, + { + "epoch": 1.5323750159012848, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6363188028335571, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8651900887489319, + "num_tokens": 459802516.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5758185386657715, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.878983199596405, + "num_tokens": 459837957.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5467547178268433, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.863792359828949, + "num_tokens": 459878360.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5845859050750732, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8730472922325134, + "num_tokens": 459920653.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5516027212142944, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.872763991355896, + "num_tokens": 459961852.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6354327201843262, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.867007851600647, + "num_tokens": 459995591.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5520540475845337, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8803386092185974, + "num_tokens": 460036339.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6933746337890625, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8821186423301697, + "num_tokens": 460066143.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7347946166992188, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8726541996002197, + "num_tokens": 460101392.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7156596183776855, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8572235703468323, + "num_tokens": 460137657.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6542118787765503, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.872806191444397, + "num_tokens": 460181943.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7023900747299194, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8570124506950378, + "num_tokens": 460223154.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6766093969345093, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8706430196762085, + "num_tokens": 460260330.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6154849529266357, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8506923913955688, + "num_tokens": 460302422.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5026341676712036, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8867857456207275, + "num_tokens": 460345069.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7657055854797363, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8583087921142578, + "num_tokens": 460382030.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7404011487960815, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8611968159675598, + "num_tokens": 460419134.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5063533782958984, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8882186412811279, + "num_tokens": 460455909.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6050339937210083, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8850627541542053, + "num_tokens": 460489108.0, + "step": 12064 + }, + { + "epoch": 1.5347920111945044, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7277265787124634, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.859083890914917, + "num_tokens": 460525343.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5146631002426147, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8723540306091309, + "num_tokens": 460569926.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.734653353691101, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8612385988235474, + "num_tokens": 460610099.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6968536376953125, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8701109290122986, + "num_tokens": 460644140.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5805332660675049, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8811107873916626, + "num_tokens": 460681888.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5461578369140625, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8879560232162476, + "num_tokens": 460721811.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.8125534057617188, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8692044019699097, + "num_tokens": 460756732.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.7151424884796143, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8771936893463135, + "num_tokens": 460790267.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.6616321802139282, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8835760951042175, + "num_tokens": 460822564.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7574063539505005, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8647446036338806, + "num_tokens": 460859802.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 2.3484230041503906e-05, + "grad_norm": 1.5673282146453857, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8760100603103638, + "num_tokens": 460899430.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6400259733200073, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8786322474479675, + "num_tokens": 460935015.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5365577936172485, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8600600957870483, + "num_tokens": 460974914.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6857093572616577, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8681418895721436, + "num_tokens": 461010112.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.627318263053894, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8508656024932861, + "num_tokens": 461051014.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6395316123962402, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8639880418777466, + "num_tokens": 461087440.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6883699893951416, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8688950538635254, + "num_tokens": 461122762.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5756226778030396, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.868352472782135, + "num_tokens": 461164550.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7020729780197144, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8615561127662659, + "num_tokens": 461201231.0, + "step": 12083 + }, + { + "epoch": 1.5372090064877242, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6245734691619873, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8834468722343445, + "num_tokens": 461235115.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.829526424407959, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8598697185516357, + "num_tokens": 461268687.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7509342432022095, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8775765299797058, + "num_tokens": 461307925.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6559200286865234, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.862769603729248, + "num_tokens": 461347200.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6050158739089966, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8701680302619934, + "num_tokens": 461389560.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6913834810256958, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8595847487449646, + "num_tokens": 461424924.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6641172170639038, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8698919415473938, + "num_tokens": 461463675.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.8100757598876953, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8767519593238831, + "num_tokens": 461500707.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6009552478790283, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8599095940589905, + "num_tokens": 461540624.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4738078117370605, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.87317955493927, + "num_tokens": 461580347.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7375226020812988, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8561120629310608, + "num_tokens": 461611384.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6213185787200928, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8664553761482239, + "num_tokens": 461648340.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4796857833862305, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8762189745903015, + "num_tokens": 461694839.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6553351879119873, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8694765567779541, + "num_tokens": 461730247.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.71802818775177, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8599299192428589, + "num_tokens": 461764329.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5434401035308838, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8800643682479858, + "num_tokens": 461800734.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 16.576519012451172, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8835750222206116, + "num_tokens": 461835367.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.60012948513031, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8772279024124146, + "num_tokens": 461872091.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.757664680480957, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8703749775886536, + "num_tokens": 461908977.0, + "step": 12102 + }, + { + "epoch": 1.539626001780944, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5194157361984253, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8785340785980225, + "num_tokens": 461948472.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5003763437271118, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8628987669944763, + "num_tokens": 461994196.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5670056343078613, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8687894344329834, + "num_tokens": 462036208.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 3.7243027687072754, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8878955245018005, + "num_tokens": 462073176.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7669075727462769, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8604394197463989, + "num_tokens": 462104030.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.9381136894226074, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8642406463623047, + "num_tokens": 462137385.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.875887393951416, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8753699660301208, + "num_tokens": 462170433.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5596373081207275, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8717271089553833, + "num_tokens": 462212516.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6284589767456055, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8737688064575195, + "num_tokens": 462248184.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4274656772613525, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8723499774932861, + "num_tokens": 462290877.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5340826511383057, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8783529996871948, + "num_tokens": 462331037.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7006829977035522, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.866024374961853, + "num_tokens": 462365858.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.8336511850357056, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8631027936935425, + "num_tokens": 462395822.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.674396276473999, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8733764886856079, + "num_tokens": 462431868.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7223265171051025, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8673414587974548, + "num_tokens": 462468418.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.582115888595581, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8715075254440308, + "num_tokens": 462503657.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.753292441368103, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8725000619888306, + "num_tokens": 462535274.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.606091856956482, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8744825124740601, + "num_tokens": 462572223.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5715564489364624, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8689454793930054, + "num_tokens": 462616096.0, + "step": 12121 + }, + { + "epoch": 1.5420429970741636, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7620326280593872, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8660269975662231, + "num_tokens": 462650325.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5652958154678345, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8704712390899658, + "num_tokens": 462691515.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.504981279373169, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8723363876342773, + "num_tokens": 462731127.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5678516626358032, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8797285556793213, + "num_tokens": 462768525.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5185837745666504, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.875968873500824, + "num_tokens": 462806616.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5790138244628906, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8724486827850342, + "num_tokens": 462845698.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6634202003479004, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8792117834091187, + "num_tokens": 462880818.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5780915021896362, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8592531681060791, + "num_tokens": 462918696.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6763179302215576, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8598339557647705, + "num_tokens": 462956670.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6567753553390503, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8756828308105469, + "num_tokens": 462989159.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.747357964515686, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8831582069396973, + "num_tokens": 463019775.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5826181173324585, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8779604434967041, + "num_tokens": 463059264.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.675499677658081, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8696297407150269, + "num_tokens": 463096300.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7216322422027588, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8716219067573547, + "num_tokens": 463127241.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.565440058708191, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8647578358650208, + "num_tokens": 463168077.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5588597059249878, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8662675619125366, + "num_tokens": 463210548.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4997559785842896, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8694018125534058, + "num_tokens": 463253325.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5349596738815308, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8713661432266235, + "num_tokens": 463294799.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7244926691055298, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8677029013633728, + "num_tokens": 463333525.0, + "step": 12140 + }, + { + "epoch": 1.5444599923673832, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6472841501235962, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.859106183052063, + "num_tokens": 463371459.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5462688207626343, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8542947769165039, + "num_tokens": 463413189.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7563531398773193, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.849474310874939, + "num_tokens": 463447674.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5129276514053345, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8613696694374084, + "num_tokens": 463492360.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6845866441726685, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8761804103851318, + "num_tokens": 463526155.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6573660373687744, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8736230731010437, + "num_tokens": 463560427.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7101330757141113, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8799334764480591, + "num_tokens": 463592400.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6076680421829224, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8851529955863953, + "num_tokens": 463630972.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7160639762878418, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8655861616134644, + "num_tokens": 463666943.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5950636863708496, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8777529001235962, + "num_tokens": 463707988.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.698800802230835, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8635902404785156, + "num_tokens": 463745876.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7939385175704956, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8714425563812256, + "num_tokens": 463780511.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.704558253288269, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8677241802215576, + "num_tokens": 463819189.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7197986841201782, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8864727020263672, + "num_tokens": 463853148.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7374480962753296, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8567295074462891, + "num_tokens": 463886976.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6150965690612793, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.859647274017334, + "num_tokens": 463926287.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5989819765090942, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8728240132331848, + "num_tokens": 463962269.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6738308668136597, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8677963018417358, + "num_tokens": 463998322.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7524768114089966, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8658728003501892, + "num_tokens": 464035752.0, + "step": 12159 + }, + { + "epoch": 1.546876987660603, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6826822757720947, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8667316436767578, + "num_tokens": 464071803.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.544933795928955, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8708547353744507, + "num_tokens": 464114516.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6101535558700562, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8743897676467896, + "num_tokens": 464150342.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6352343559265137, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8731371164321899, + "num_tokens": 464187357.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5835981369018555, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8769230842590332, + "num_tokens": 464224187.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7159724235534668, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8677035570144653, + "num_tokens": 464256655.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6084771156311035, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8719696402549744, + "num_tokens": 464292466.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5806676149368286, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.868110716342926, + "num_tokens": 464331154.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6360597610473633, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8590598106384277, + "num_tokens": 464371238.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6952886581420898, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8652591705322266, + "num_tokens": 464407359.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6456910371780396, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8777140974998474, + "num_tokens": 464445374.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4309699535369873, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8880546689033508, + "num_tokens": 464490291.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5900903940200806, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8679630756378174, + "num_tokens": 464529206.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6229264736175537, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8601685166358948, + "num_tokens": 464567662.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.579847812652588, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.875338613986969, + "num_tokens": 464607620.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7032705545425415, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8631693124771118, + "num_tokens": 464646093.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.789982795715332, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8527111411094666, + "num_tokens": 464681440.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6730546951293945, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8702304363250732, + "num_tokens": 464719943.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6153372526168823, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8801586627960205, + "num_tokens": 464755060.0, + "step": 12178 + }, + { + "epoch": 1.5492939829538228, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.549696922302246, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8711763620376587, + "num_tokens": 464801000.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.494517207145691, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8732476830482483, + "num_tokens": 464842949.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5909010171890259, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8632811307907104, + "num_tokens": 464885548.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6615419387817383, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8690472841262817, + "num_tokens": 464922021.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5995625257492065, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8630176782608032, + "num_tokens": 464962087.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6211514472961426, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8673433065414429, + "num_tokens": 465000182.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4766825437545776, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8713747262954712, + "num_tokens": 465042404.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6901264190673828, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8373304605484009, + "num_tokens": 465082862.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.59761381149292, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8639192581176758, + "num_tokens": 465124468.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.684593915939331, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8729507923126221, + "num_tokens": 465157276.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.739562749862671, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8499047160148621, + "num_tokens": 465199352.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7755191326141357, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8766297101974487, + "num_tokens": 465229561.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.787523865699768, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.868756890296936, + "num_tokens": 465260553.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6447844505310059, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8705530762672424, + "num_tokens": 465299635.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6925581693649292, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8702831864356995, + "num_tokens": 465338248.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4855782985687256, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8794965744018555, + "num_tokens": 465380874.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.673670768737793, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8706644773483276, + "num_tokens": 465418484.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6989104747772217, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8668258786201477, + "num_tokens": 465455680.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5051487684249878, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8851364850997925, + "num_tokens": 465496367.0, + "step": 12197 + }, + { + "epoch": 1.5517109782470424, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7374632358551025, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8684359788894653, + "num_tokens": 465528654.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7317314147949219, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.872602105140686, + "num_tokens": 465559263.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5613312721252441, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8769010901451111, + "num_tokens": 465598188.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6384621858596802, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8697108030319214, + "num_tokens": 465636365.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6069074869155884, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8645690679550171, + "num_tokens": 465679697.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6393240690231323, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8716728687286377, + "num_tokens": 465717770.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.523768663406372, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8792328834533691, + "num_tokens": 465759776.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6077691316604614, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8689619898796082, + "num_tokens": 465800998.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7018260955810547, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8574872016906738, + "num_tokens": 465837457.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4622786045074463, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8670305609703064, + "num_tokens": 465882186.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4833135604858398, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.887127697467804, + "num_tokens": 465922754.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5583761930465698, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8716761469841003, + "num_tokens": 465959682.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6663143634796143, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8760890960693359, + "num_tokens": 465994751.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5420076847076416, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8739124536514282, + "num_tokens": 466032659.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5110018253326416, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8841016292572021, + "num_tokens": 466069842.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5267665386199951, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8824939727783203, + "num_tokens": 466110823.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6168595552444458, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8675887584686279, + "num_tokens": 466146271.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5905406475067139, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8669447898864746, + "num_tokens": 466188999.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6707087755203247, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8680692911148071, + "num_tokens": 466224314.0, + "step": 12216 + }, + { + "epoch": 1.554127973540262, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6844733953475952, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8585145473480225, + "num_tokens": 466260591.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.615344524383545, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8669196367263794, + "num_tokens": 466298878.0, + "step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5695399045944214, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8887735605239868, + "num_tokens": 466335156.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7969534397125244, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.846408486366272, + "num_tokens": 466368972.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5863760709762573, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8675616979598999, + "num_tokens": 466406106.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5898020267486572, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8602895140647888, + "num_tokens": 466449026.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6016685962677002, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8706491589546204, + "num_tokens": 466485903.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6000884771347046, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8596069812774658, + "num_tokens": 466527735.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.593158483505249, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8835301399230957, + "num_tokens": 466565719.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5206705331802368, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8631730079650879, + "num_tokens": 466605864.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4680148363113403, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8801251649856567, + "num_tokens": 466647603.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7119048833847046, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8713771104812622, + "num_tokens": 466678722.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5708317756652832, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8700838685035706, + "num_tokens": 466719458.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5707743167877197, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8785665035247803, + "num_tokens": 466759496.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 3.6810905933380127, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8586427569389343, + "num_tokens": 466796849.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6375865936279297, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8706169128417969, + "num_tokens": 466834405.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7517075538635254, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8654282093048096, + "num_tokens": 466869140.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.855883002281189, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8744913935661316, + "num_tokens": 466902029.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4845328330993652, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.885118842124939, + "num_tokens": 466939378.0, + "step": 12235 + }, + { + "epoch": 1.5565449688334816, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4827773571014404, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8815895318984985, + "num_tokens": 466981859.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.626158356666565, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8596750497817993, + "num_tokens": 467020992.0, + "step": 12237 + }, + { + "epoch": 1.5567993893906626, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 4.654300212860107, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8819880485534668, + "num_tokens": 467055376.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.8278255462646484, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8527572154998779, + "num_tokens": 467091439.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7390761375427246, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8656728267669678, + "num_tokens": 467124725.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6879323720932007, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.868339478969574, + "num_tokens": 467160792.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.530930995941162, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8663705587387085, + "num_tokens": 467202922.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6061887741088867, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8547251224517822, + "num_tokens": 467244939.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7105821371078491, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.857570469379425, + "num_tokens": 467278936.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6656149625778198, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8704397678375244, + "num_tokens": 467314584.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7141010761260986, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8670645356178284, + "num_tokens": 467350595.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.9028213024139404, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8751546144485474, + "num_tokens": 467383430.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6452895402908325, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.861414909362793, + "num_tokens": 467420554.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5126279592514038, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8821059465408325, + "num_tokens": 467460680.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5818709135055542, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8666880130767822, + "num_tokens": 467500798.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.8723030090332031, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8513802289962769, + "num_tokens": 467531571.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5749599933624268, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8716729879379272, + "num_tokens": 467569485.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7171578407287598, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8570351004600525, + "num_tokens": 467606901.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5454381704330444, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8619663715362549, + "num_tokens": 467648240.0, + "step": 12254 + }, + { + "epoch": 1.5589619641267014, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7318689823150635, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8688861131668091, + "num_tokens": 467683011.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6475474834442139, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8659490942955017, + "num_tokens": 467719574.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.51860511302948, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8721877336502075, + "num_tokens": 467758966.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6045559644699097, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8698067665100098, + "num_tokens": 467795656.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6668879985809326, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8690319061279297, + "num_tokens": 467834345.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5984575748443604, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8732937574386597, + "num_tokens": 467870732.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6043678522109985, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8736978769302368, + "num_tokens": 467909760.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5535091161727905, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.879963755607605, + "num_tokens": 467949392.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5430083274841309, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8843817114830017, + "num_tokens": 467984229.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.630488395690918, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8543563485145569, + "num_tokens": 468025287.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.568563461303711, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8677558302879333, + "num_tokens": 468063875.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6013071537017822, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8673854470252991, + "num_tokens": 468103145.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7588248252868652, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8793430328369141, + "num_tokens": 468137265.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7140326499938965, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8776390552520752, + "num_tokens": 468169031.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.792253851890564, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8629598617553711, + "num_tokens": 468203289.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6439547538757324, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8599741458892822, + "num_tokens": 468245509.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6407233476638794, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8751709461212158, + "num_tokens": 468279428.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4930671453475952, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8786851763725281, + "num_tokens": 468317189.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4882688522338867, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8851571679115295, + "num_tokens": 468358555.0, + "step": 12273 + }, + { + "epoch": 1.5613789594199212, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.538976788520813, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8664652109146118, + "num_tokens": 468400216.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.56583833694458, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8776758909225464, + "num_tokens": 468434342.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.7424520254135132, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8626706600189209, + "num_tokens": 468467344.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6886569261550903, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.869004487991333, + "num_tokens": 468501884.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5125025510787964, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8860865235328674, + "num_tokens": 468541952.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7417150735855103, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8843830823898315, + "num_tokens": 468573789.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6109169721603394, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8874384164810181, + "num_tokens": 468607696.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7677662372589111, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8744995594024658, + "num_tokens": 468642835.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8353604078292847, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8700497150421143, + "num_tokens": 468674444.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4880985021591187, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8822129964828491, + "num_tokens": 468716729.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6582740545272827, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8688929080963135, + "num_tokens": 468753561.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5915584564208984, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8670828342437744, + "num_tokens": 468795328.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5227704048156738, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8732272386550903, + "num_tokens": 468835766.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6950466632843018, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8633496761322021, + "num_tokens": 468867905.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6563547849655151, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8604404926300049, + "num_tokens": 468906695.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6479655504226685, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8613472580909729, + "num_tokens": 468940754.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6459097862243652, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8678378462791443, + "num_tokens": 468979383.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6559501886367798, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8685956001281738, + "num_tokens": 469017020.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6117875576019287, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8743084073066711, + "num_tokens": 469060036.0, + "step": 12292 + }, + { + "epoch": 1.5637959547131408, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5139970779418945, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8878747224807739, + "num_tokens": 469101064.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4967615604400635, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.879623293876648, + "num_tokens": 469144476.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4648500680923462, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.882143497467041, + "num_tokens": 469183541.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5518798828125, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8707972168922424, + "num_tokens": 469221456.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6050810813903809, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8690357804298401, + "num_tokens": 469267000.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7134984731674194, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8726414442062378, + "num_tokens": 469306360.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7575457096099854, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8477240800857544, + "num_tokens": 469344074.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5259946584701538, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8858433961868286, + "num_tokens": 469384468.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7181771993637085, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8483469486236572, + "num_tokens": 469423823.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5060898065567017, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8684989809989929, + "num_tokens": 469465815.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5940423011779785, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8844603896141052, + "num_tokens": 469500284.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.602677822113037, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8817342519760132, + "num_tokens": 469540881.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6720365285873413, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8593412041664124, + "num_tokens": 469579170.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6273707151412964, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8833619356155396, + "num_tokens": 469610849.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6720207929611206, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8664888143539429, + "num_tokens": 469647420.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.507462978363037, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8648035526275635, + "num_tokens": 469692412.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4904965162277222, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8830336332321167, + "num_tokens": 469733513.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.647283673286438, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8696045279502869, + "num_tokens": 469769687.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6572738885879517, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8656432628631592, + "num_tokens": 469806047.0, + "step": 12311 + }, + { + "epoch": 1.5662129500063604, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.4909294843673706, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8749592304229736, + "num_tokens": 469845518.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5554403066635132, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8761335015296936, + "num_tokens": 469881123.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.6095075607299805, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8656553030014038, + "num_tokens": 469920182.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 2.3603439331054688e-05, + "grad_norm": 1.5822540521621704, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.877898097038269, + "num_tokens": 469955739.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5624068975448608, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.866070568561554, + "num_tokens": 469997839.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7008862495422363, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8599991202354431, + "num_tokens": 470034237.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5667462348937988, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8673674464225769, + "num_tokens": 470073855.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.623154878616333, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8722928166389465, + "num_tokens": 470108995.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6937315464019775, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.874116063117981, + "num_tokens": 470141944.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4631686210632324, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8765168190002441, + "num_tokens": 470182710.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7530887126922607, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8601148128509521, + "num_tokens": 470220732.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6728622913360596, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8696789741516113, + "num_tokens": 470253965.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5874571800231934, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8630797266960144, + "num_tokens": 470295955.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5689154863357544, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8837043046951294, + "num_tokens": 470336981.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7366670370101929, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8592884540557861, + "num_tokens": 470375570.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.624018669128418, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8585506677627563, + "num_tokens": 470413476.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 2.122983455657959, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8654376268386841, + "num_tokens": 470447404.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4882558584213257, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8772149085998535, + "num_tokens": 470489469.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5598335266113281, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8795844316482544, + "num_tokens": 470531519.0, + "step": 12330 + }, + { + "epoch": 1.5686299452995802, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5297253131866455, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.868464469909668, + "num_tokens": 470571897.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.746211051940918, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8726593255996704, + "num_tokens": 470602687.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.522817611694336, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8648710250854492, + "num_tokens": 470642744.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5546554327011108, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8585572838783264, + "num_tokens": 470688411.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4843614101409912, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.877615213394165, + "num_tokens": 470729131.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.760605812072754, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8582714796066284, + "num_tokens": 470766942.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5429189205169678, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8859744071960449, + "num_tokens": 470807590.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.637467861175537, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8686939477920532, + "num_tokens": 470842468.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.637449026107788, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8689879775047302, + "num_tokens": 470881675.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6273001432418823, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8432002067565918, + "num_tokens": 470922594.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.638198733329773, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8648662567138672, + "num_tokens": 470960197.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6239169836044312, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8503673672676086, + "num_tokens": 470997948.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7389862537384033, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8559412956237793, + "num_tokens": 471032499.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5160235166549683, + "learning_rate": 1e-06, + "loss": 0.3287, + "mean_token_accuracy": 0.8922006487846375, + "num_tokens": 471070603.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7077031135559082, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8580071330070496, + "num_tokens": 471108651.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6481127738952637, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.878093957901001, + "num_tokens": 471145165.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7834478616714478, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8770126104354858, + "num_tokens": 471175289.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4951086044311523, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8700284957885742, + "num_tokens": 471217080.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5552831888198853, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8742403984069824, + "num_tokens": 471255892.0, + "step": 12349 + }, + { + "epoch": 1.5710469405928, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6513009071350098, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8715037703514099, + "num_tokens": 471293010.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.613012671470642, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8634436130523682, + "num_tokens": 471332798.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6865564584732056, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8768664598464966, + "num_tokens": 471364323.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.699576497077942, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8693805932998657, + "num_tokens": 471399905.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6053014993667603, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8700293302536011, + "num_tokens": 471434612.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7374824285507202, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8619886636734009, + "num_tokens": 471467691.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5988316535949707, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8762621879577637, + "num_tokens": 471505674.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6959373950958252, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8617314696311951, + "num_tokens": 471541478.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.558448076248169, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.871234655380249, + "num_tokens": 471583420.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5651897192001343, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8908750414848328, + "num_tokens": 471618010.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5819127559661865, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8699779510498047, + "num_tokens": 471658210.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5295571088790894, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.879233717918396, + "num_tokens": 471698036.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5853313207626343, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8798688650131226, + "num_tokens": 471735138.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6004033088684082, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8713414669036865, + "num_tokens": 471771887.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7389882802963257, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8621000647544861, + "num_tokens": 471811896.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7655540704727173, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8553801774978638, + "num_tokens": 471845273.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5501435995101929, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8654927611351013, + "num_tokens": 471888076.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6979552507400513, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8601408004760742, + "num_tokens": 471924254.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.635582685470581, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8735984563827515, + "num_tokens": 471959935.0, + "step": 12368 + }, + { + "epoch": 1.5734639358860196, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6485432386398315, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8790441751480103, + "num_tokens": 471995982.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7751500606536865, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8696739673614502, + "num_tokens": 472026890.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6742241382598877, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8656229972839355, + "num_tokens": 472061877.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4920469522476196, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8829388618469238, + "num_tokens": 472102010.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6517126560211182, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8780127763748169, + "num_tokens": 472138044.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7140265703201294, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8589379191398621, + "num_tokens": 472176005.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4542615413665771, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8717089891433716, + "num_tokens": 472223953.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5984728336334229, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8658055067062378, + "num_tokens": 472266915.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7105382680892944, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8716692924499512, + "num_tokens": 472300530.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5827772617340088, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8792542815208435, + "num_tokens": 472337952.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.523327350616455, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8796871900558472, + "num_tokens": 472377219.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.521545171737671, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8787944316864014, + "num_tokens": 472416919.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6045295000076294, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.875687837600708, + "num_tokens": 472458209.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8164291381835938, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8641848564147949, + "num_tokens": 472494010.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5629897117614746, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8842360377311707, + "num_tokens": 472537647.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.642896056175232, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8575947284698486, + "num_tokens": 472578228.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5283633470535278, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.8887669444084167, + "num_tokens": 472612215.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5827999114990234, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8662521243095398, + "num_tokens": 472651786.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.59744393825531, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8705410361289978, + "num_tokens": 472691148.0, + "step": 12387 + }, + { + "epoch": 1.5758809311792392, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5120632648468018, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.885603666305542, + "num_tokens": 472726631.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.541006326675415, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8818013668060303, + "num_tokens": 472766653.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7299435138702393, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8783067464828491, + "num_tokens": 472801732.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5860850811004639, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8607664108276367, + "num_tokens": 472841785.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7215932607650757, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8706932067871094, + "num_tokens": 472876444.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5469106435775757, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.873481035232544, + "num_tokens": 472912839.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.610830307006836, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8822821378707886, + "num_tokens": 472952371.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6039096117019653, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8776552081108093, + "num_tokens": 472990184.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7418467998504639, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8642804026603699, + "num_tokens": 473024083.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.530775785446167, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8764311075210571, + "num_tokens": 473061843.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6450294256210327, + "learning_rate": 1e-06, + "loss": 0.3175, + "mean_token_accuracy": 0.8972426652908325, + "num_tokens": 473099073.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7466199398040771, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8723993301391602, + "num_tokens": 473133116.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.705575942993164, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8704926371574402, + "num_tokens": 473172890.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6328895092010498, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8724563121795654, + "num_tokens": 473207988.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5324558019638062, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8691115379333496, + "num_tokens": 473249393.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.64606511592865, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8678202629089355, + "num_tokens": 473287984.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7046709060668945, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8727068901062012, + "num_tokens": 473327296.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.765142560005188, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8672885894775391, + "num_tokens": 473358694.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5662261247634888, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8850440979003906, + "num_tokens": 473401109.0, + "step": 12406 + }, + { + "epoch": 1.5782979264724588, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6307655572891235, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8676141500473022, + "num_tokens": 473438307.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6918916702270508, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8664114475250244, + "num_tokens": 473475516.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.563266634941101, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8751257061958313, + "num_tokens": 473515062.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5420211553573608, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8867388963699341, + "num_tokens": 473552754.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6588062047958374, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.884103536605835, + "num_tokens": 473585096.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7122389078140259, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8595646619796753, + "num_tokens": 473622638.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4084358215332031, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8867472410202026, + "num_tokens": 473663933.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6172196865081787, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8662877082824707, + "num_tokens": 473703077.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5761433839797974, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8694887161254883, + "num_tokens": 473744003.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.520450234413147, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8718568086624146, + "num_tokens": 473782374.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.55547034740448, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8674051761627197, + "num_tokens": 473818032.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6762624979019165, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8568153381347656, + "num_tokens": 473859535.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6402497291564941, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8795151114463806, + "num_tokens": 473896594.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6213515996932983, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8566839694976807, + "num_tokens": 473938594.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.578657627105713, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8744813203811646, + "num_tokens": 473976924.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.55532705783844, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8777329325675964, + "num_tokens": 474017146.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.587652325630188, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8789176940917969, + "num_tokens": 474056138.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7003180980682373, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.869683563709259, + "num_tokens": 474090312.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5811179876327515, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.864196240901947, + "num_tokens": 474130410.0, + "step": 12425 + }, + { + "epoch": 1.5807149217656786, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5331202745437622, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8647010326385498, + "num_tokens": 474172760.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5866285562515259, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.867444634437561, + "num_tokens": 474209036.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6866223812103271, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8594908714294434, + "num_tokens": 474248220.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4500378370285034, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8864346742630005, + "num_tokens": 474290711.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6152633428573608, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8658862113952637, + "num_tokens": 474327614.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4817707538604736, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8626763224601746, + "num_tokens": 474373401.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5796934366226196, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8754250407218933, + "num_tokens": 474410914.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.506201982498169, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8843481540679932, + "num_tokens": 474450356.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6597263813018799, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8547118306159973, + "num_tokens": 474490476.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4128338098526, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8804634213447571, + "num_tokens": 474535074.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5560396909713745, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8785823583602905, + "num_tokens": 474574564.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6728808879852295, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8685871958732605, + "num_tokens": 474611256.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6630895137786865, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8625906705856323, + "num_tokens": 474651878.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7021366357803345, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8654220104217529, + "num_tokens": 474689832.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.403648853302002, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8926714658737183, + "num_tokens": 474731220.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 3.8494577407836914, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8469341993331909, + "num_tokens": 474767434.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.594193458557129, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8918544054031372, + "num_tokens": 474804770.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7120771408081055, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8650566339492798, + "num_tokens": 474840228.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6890867948532104, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8618342876434326, + "num_tokens": 474874545.0, + "step": 12444 + }, + { + "epoch": 1.5831319170588984, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6045480966567993, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8803147077560425, + "num_tokens": 474913113.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6101199388504028, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8637501001358032, + "num_tokens": 474953624.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7742319107055664, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8726800680160522, + "num_tokens": 474989408.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7229773998260498, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8745126724243164, + "num_tokens": 475021734.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5945695638656616, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.860782265663147, + "num_tokens": 475060430.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.657549500465393, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8664594292640686, + "num_tokens": 475095446.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5959125757217407, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8757646083831787, + "num_tokens": 475128994.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.53443443775177, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8783488869667053, + "num_tokens": 475168232.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5638600587844849, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8783107995986938, + "num_tokens": 475205262.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6264296770095825, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8588082790374756, + "num_tokens": 475244630.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.520819902420044, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8834263682365417, + "num_tokens": 475279589.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5241215229034424, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8567023277282715, + "num_tokens": 475325177.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5821993350982666, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8796443939208984, + "num_tokens": 475362052.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6329350471496582, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8592947721481323, + "num_tokens": 475399421.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5680993795394897, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8790265917778015, + "num_tokens": 475436396.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5388894081115723, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.881199836730957, + "num_tokens": 475471988.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6417300701141357, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.865927517414093, + "num_tokens": 475508186.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6164510250091553, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8807730674743652, + "num_tokens": 475547361.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8001583814620972, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8611358404159546, + "num_tokens": 475582873.0, + "step": 12463 + }, + { + "epoch": 1.585548912352118, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6579753160476685, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8792480826377869, + "num_tokens": 475621043.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6861772537231445, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8693482875823975, + "num_tokens": 475658352.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5503458976745605, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8735112547874451, + "num_tokens": 475698632.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5182511806488037, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8665916919708252, + "num_tokens": 475737928.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5788992643356323, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8597116470336914, + "num_tokens": 475782086.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8691086769104004, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8429659605026245, + "num_tokens": 475814292.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4423309564590454, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8765243291854858, + "num_tokens": 475860694.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 3.755908966064453, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.864862859249115, + "num_tokens": 475894476.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.586643099784851, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8770914077758789, + "num_tokens": 475929305.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.602799892425537, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8623183965682983, + "num_tokens": 475970128.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4821033477783203, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.871712327003479, + "num_tokens": 476010653.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4656867980957031, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.865726888179779, + "num_tokens": 476055492.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5718683004379272, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8713549375534058, + "num_tokens": 476093060.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7142739295959473, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8694987297058105, + "num_tokens": 476127507.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.439698576927185, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.883487343788147, + "num_tokens": 476172227.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4889636039733887, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8716806173324585, + "num_tokens": 476216504.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6917619705200195, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8855372667312622, + "num_tokens": 476247612.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7505085468292236, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8683289885520935, + "num_tokens": 476281304.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5714517831802368, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.864652156829834, + "num_tokens": 476321680.0, + "step": 12482 + }, + { + "epoch": 1.5879659076453376, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5517768859863281, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8714466691017151, + "num_tokens": 476359315.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.544008731842041, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8794100880622864, + "num_tokens": 476399091.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5191878080368042, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8625730872154236, + "num_tokens": 476441830.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6302316188812256, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8546687364578247, + "num_tokens": 476482722.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.542019009590149, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8602430820465088, + "num_tokens": 476523865.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5691378116607666, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8951634168624878, + "num_tokens": 476559658.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6841583251953125, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8744955062866211, + "num_tokens": 476599102.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5540448427200317, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8525987863540649, + "num_tokens": 476642849.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6338425874710083, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.862827718257904, + "num_tokens": 476678024.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4622150659561157, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8841052055358887, + "num_tokens": 476722718.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5718673467636108, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8766919374465942, + "num_tokens": 476759510.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6185582876205444, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.862777829170227, + "num_tokens": 476797962.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5864685773849487, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8739455342292786, + "num_tokens": 476836398.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4752774238586426, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8753396272659302, + "num_tokens": 476878349.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5730949640274048, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8609849214553833, + "num_tokens": 476919397.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6898016929626465, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8735123872756958, + "num_tokens": 476954943.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4675472974777222, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8780824542045593, + "num_tokens": 476995033.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5375008583068848, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8764110207557678, + "num_tokens": 477033150.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6586980819702148, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.85324627161026, + "num_tokens": 477072358.0, + "step": 12501 + }, + { + "epoch": 1.5903829029385574, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5450677871704102, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8883281946182251, + "num_tokens": 477108223.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8190709352493286, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8769056797027588, + "num_tokens": 477151271.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6111652851104736, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.850824236869812, + "num_tokens": 477190951.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5114235877990723, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8778810501098633, + "num_tokens": 477234527.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6813921928405762, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8577588796615601, + "num_tokens": 477269523.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6232386827468872, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8712079524993896, + "num_tokens": 477309582.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5890357494354248, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8757561445236206, + "num_tokens": 477347120.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5984280109405518, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8730356097221375, + "num_tokens": 477386531.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6180174350738525, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8654477000236511, + "num_tokens": 477426112.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6746854782104492, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8812686204910278, + "num_tokens": 477463089.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4778027534484863, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8805144429206848, + "num_tokens": 477508520.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5583720207214355, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8752092123031616, + "num_tokens": 477548064.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7267204523086548, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8452558517456055, + "num_tokens": 477583404.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7301689386367798, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8631552457809448, + "num_tokens": 477615809.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6347219944000244, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8818231821060181, + "num_tokens": 477654101.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.52169930934906, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8783243298530579, + "num_tokens": 477695569.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.627021312713623, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8713743686676025, + "num_tokens": 477733566.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6018891334533691, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8647059202194214, + "num_tokens": 477776486.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6289403438568115, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8543844819068909, + "num_tokens": 477814885.0, + "step": 12520 + }, + { + "epoch": 1.5927998982317773, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6703591346740723, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8568964004516602, + "num_tokens": 477851959.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6343077421188354, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8715860843658447, + "num_tokens": 477889446.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6449934244155884, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8719554543495178, + "num_tokens": 477924791.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4533298015594482, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8694344758987427, + "num_tokens": 477967821.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6736847162246704, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.869907557964325, + "num_tokens": 478005664.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4935410022735596, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8887112140655518, + "num_tokens": 478046352.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5214598178863525, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8687732219696045, + "num_tokens": 478091447.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5271515846252441, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8763860464096069, + "num_tokens": 478134165.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5879364013671875, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8769944906234741, + "num_tokens": 478173308.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5738003253936768, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8740061521530151, + "num_tokens": 478212949.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5708566904067993, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8812936544418335, + "num_tokens": 478249144.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.61826753616333, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8648796081542969, + "num_tokens": 478288825.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.623266577720642, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.860809862613678, + "num_tokens": 478325668.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7444583177566528, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8557102680206299, + "num_tokens": 478360294.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.49709951877594, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8768656849861145, + "num_tokens": 478402373.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6544173955917358, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8697343468666077, + "num_tokens": 478439079.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.594006896018982, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8775705099105835, + "num_tokens": 478476696.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8157970905303955, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8569995164871216, + "num_tokens": 478507944.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6072441339492798, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8610592484474182, + "num_tokens": 478548059.0, + "step": 12539 + }, + { + "epoch": 1.5952168935249968, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5547981262207031, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8651013374328613, + "num_tokens": 478592017.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5855482816696167, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8772915601730347, + "num_tokens": 478627629.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5042712688446045, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8854013681411743, + "num_tokens": 478668008.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5557842254638672, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8676609396934509, + "num_tokens": 478706550.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6798596382141113, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8621140718460083, + "num_tokens": 478743030.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.9884902238845825, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8665533661842346, + "num_tokens": 478772779.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6583251953125, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8867393732070923, + "num_tokens": 478807663.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4893980026245117, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.876246452331543, + "num_tokens": 478851759.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.937103271484375, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8276423215866089, + "num_tokens": 478883292.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6124480962753296, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.86574786901474, + "num_tokens": 478923766.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5344220399856567, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8650063276290894, + "num_tokens": 478964582.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5934220552444458, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8787502646446228, + "num_tokens": 479003843.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.705471396446228, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8752678036689758, + "num_tokens": 479036895.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.568321943283081, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8616601228713989, + "num_tokens": 479077042.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5220012664794922, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8817697763442993, + "num_tokens": 479117646.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7097631692886353, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8618984222412109, + "num_tokens": 479157627.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7058979272842407, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8677546977996826, + "num_tokens": 479190840.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6371746063232422, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8755708336830139, + "num_tokens": 479225258.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.764966607093811, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8701435923576355, + "num_tokens": 479259207.0, + "step": 12558 + }, + { + "epoch": 1.5976338888182164, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.511857032775879, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8629429340362549, + "num_tokens": 479302930.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.651965856552124, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8772215843200684, + "num_tokens": 479339628.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6322214603424072, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8756899237632751, + "num_tokens": 479377832.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.652901291847229, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.873191237449646, + "num_tokens": 479415408.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5981130599975586, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8702577352523804, + "num_tokens": 479452726.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7221983671188354, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8760783672332764, + "num_tokens": 479490072.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5848658084869385, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8706493377685547, + "num_tokens": 479530058.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6867516040802002, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8594050407409668, + "num_tokens": 479565903.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5900652408599854, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8694486618041992, + "num_tokens": 479607227.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6303255558013916, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8770496845245361, + "num_tokens": 479646775.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5834037065505981, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8636651635169983, + "num_tokens": 479683873.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5866566896438599, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.867201030254364, + "num_tokens": 479722780.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.536745548248291, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8723820447921753, + "num_tokens": 479764801.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.628986120223999, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8829007148742676, + "num_tokens": 479803245.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7423162460327148, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8661218285560608, + "num_tokens": 479837776.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.53594172000885, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.858663022518158, + "num_tokens": 479879743.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5236523151397705, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8552831411361694, + "num_tokens": 479924779.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5658234357833862, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8576592803001404, + "num_tokens": 479966156.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.618708848953247, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8761378526687622, + "num_tokens": 479999707.0, + "step": 12577 + }, + { + "epoch": 1.6000508841114363, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5741736888885498, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8743870258331299, + "num_tokens": 480041505.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.651038646697998, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8744735717773438, + "num_tokens": 480075231.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8583298921585083, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.857787549495697, + "num_tokens": 480106616.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.542426586151123, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.871698260307312, + "num_tokens": 480146563.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5092074871063232, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8821012377738953, + "num_tokens": 480186277.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.501129388809204, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8700872659683228, + "num_tokens": 480229013.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5357273817062378, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8700830936431885, + "num_tokens": 480269506.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6812140941619873, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8652827143669128, + "num_tokens": 480302754.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5612990856170654, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8694862127304077, + "num_tokens": 480343177.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5526458024978638, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8825003504753113, + "num_tokens": 480379675.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6340535879135132, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8653082847595215, + "num_tokens": 480416934.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7678800821304321, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8678030967712402, + "num_tokens": 480454170.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5616921186447144, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8698559999465942, + "num_tokens": 480495116.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5434952974319458, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8748904466629028, + "num_tokens": 480536285.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6619789600372314, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8668270111083984, + "num_tokens": 480576161.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6176283359527588, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8718746900558472, + "num_tokens": 480612897.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6074243783950806, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8589773774147034, + "num_tokens": 480652930.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5469657182693481, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8778150677680969, + "num_tokens": 480692439.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6530898809432983, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8710026144981384, + "num_tokens": 480728889.0, + "step": 12596 + }, + { + "epoch": 1.6024678794046558, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6086465120315552, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8737617135047913, + "num_tokens": 480764900.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6734336614608765, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8642048239707947, + "num_tokens": 480799264.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4730186462402344, + "learning_rate": 1e-06, + "loss": 0.3263, + "mean_token_accuracy": 0.8906207084655762, + "num_tokens": 480839191.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.629725456237793, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8735358715057373, + "num_tokens": 480876195.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.669610857963562, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.85137540102005, + "num_tokens": 480914019.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6962080001831055, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8722747564315796, + "num_tokens": 480949985.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4822880029678345, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8821425437927246, + "num_tokens": 480991077.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5518430471420288, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8587294220924377, + "num_tokens": 481036696.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7006182670593262, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8685035109519958, + "num_tokens": 481071639.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7415056228637695, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8512574434280396, + "num_tokens": 481110636.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.689610481262207, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8533790111541748, + "num_tokens": 481150923.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6321768760681152, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8639366030693054, + "num_tokens": 481191791.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.562063217163086, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8688955307006836, + "num_tokens": 481236230.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8890422582626343, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8625967502593994, + "num_tokens": 481270004.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6550817489624023, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8759413361549377, + "num_tokens": 481306483.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6585365533828735, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8751130104064941, + "num_tokens": 481341308.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6448252201080322, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8814542293548584, + "num_tokens": 481373083.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7049659490585327, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8702100515365601, + "num_tokens": 481410502.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6392672061920166, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.872287392616272, + "num_tokens": 481446817.0, + "step": 12615 + }, + { + "epoch": 1.6048848746978757, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5732028484344482, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8795809745788574, + "num_tokens": 481486405.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8058503866195679, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8657084703445435, + "num_tokens": 481524194.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6079763174057007, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8608555793762207, + "num_tokens": 481567824.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5991297960281372, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8672130107879639, + "num_tokens": 481607748.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5051076412200928, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8670713901519775, + "num_tokens": 481649939.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6028498411178589, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8754045367240906, + "num_tokens": 481687701.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.671470284461975, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8747439980506897, + "num_tokens": 481722701.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6663516759872437, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8773359656333923, + "num_tokens": 481759899.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6325218677520752, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8590120077133179, + "num_tokens": 481796713.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4859120845794678, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.867982029914856, + "num_tokens": 481839766.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6581472158432007, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8837332129478455, + "num_tokens": 481874276.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.561800479888916, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8590461015701294, + "num_tokens": 481917479.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6045050621032715, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8717308044433594, + "num_tokens": 481959133.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5486758947372437, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8782815933227539, + "num_tokens": 481997085.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5351694822311401, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8686795830726624, + "num_tokens": 482033383.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.646449089050293, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8768067359924316, + "num_tokens": 482066926.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4357191324234009, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8738096952438354, + "num_tokens": 482110538.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4895646572113037, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8890446424484253, + "num_tokens": 482149064.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7270681858062744, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8646008968353271, + "num_tokens": 482186931.0, + "step": 12634 + }, + { + "epoch": 1.6073018699910953, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7932844161987305, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8639270663261414, + "num_tokens": 482224491.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.611112356185913, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8646007180213928, + "num_tokens": 482267014.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.585384726524353, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8752748370170593, + "num_tokens": 482305659.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6135989427566528, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8808912038803101, + "num_tokens": 482344186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.8066520690917969, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8615574836730957, + "num_tokens": 482377582.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.784029245376587, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8735236525535583, + "num_tokens": 482413495.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6894816160202026, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8633671998977661, + "num_tokens": 482457090.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4264005422592163, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.88344407081604, + "num_tokens": 482503608.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5167133808135986, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8745957016944885, + "num_tokens": 482543769.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6749932765960693, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8723054528236389, + "num_tokens": 482580925.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.620095133781433, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8617260456085205, + "num_tokens": 482619627.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7054513692855835, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8696161508560181, + "num_tokens": 482655572.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7028764486312866, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8698429465293884, + "num_tokens": 482689568.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5788347721099854, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8732159733772278, + "num_tokens": 482727719.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6378751993179321, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8788801431655884, + "num_tokens": 482763861.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6728218793869019, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8713682889938354, + "num_tokens": 482796929.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5387303829193115, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8765588402748108, + "num_tokens": 482834898.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6749053001403809, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8650499582290649, + "num_tokens": 482873605.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6692259311676025, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8596917390823364, + "num_tokens": 482910112.0, + "step": 12653 + }, + { + "epoch": 1.6097188652843148, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.518477439880371, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8797396421432495, + "num_tokens": 482949682.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.537401795387268, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8817182183265686, + "num_tokens": 482985081.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.604111909866333, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8916856646537781, + "num_tokens": 483019480.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5878400802612305, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8755733966827393, + "num_tokens": 483055970.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7320159673690796, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.88182133436203, + "num_tokens": 483089870.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6204807758331299, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8786293864250183, + "num_tokens": 483125164.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5939863920211792, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8756883144378662, + "num_tokens": 483164069.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5474603176116943, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8819934129714966, + "num_tokens": 483205335.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.674360990524292, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8604552745819092, + "num_tokens": 483244185.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.558079719543457, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8743395805358887, + "num_tokens": 483282054.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6610769033432007, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8533017039299011, + "num_tokens": 483320993.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6334463357925415, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.864189624786377, + "num_tokens": 483356724.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.508150339126587, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.861328125, + "num_tokens": 483401262.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4625365734100342, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8677554726600647, + "num_tokens": 483445263.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5224336385726929, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8813872337341309, + "num_tokens": 483483159.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5341932773590088, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8660672307014465, + "num_tokens": 483526932.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7181577682495117, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8698889017105103, + "num_tokens": 483563658.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6738736629486084, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8700187802314758, + "num_tokens": 483599856.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.670827031135559, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8779100179672241, + "num_tokens": 483634884.0, + "step": 12672 + }, + { + "epoch": 1.6121358605775347, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.657474160194397, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8692665100097656, + "num_tokens": 483677616.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6396194696426392, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8749428987503052, + "num_tokens": 483711369.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.696515679359436, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8602204918861389, + "num_tokens": 483748947.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.564833402633667, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8718352317810059, + "num_tokens": 483793391.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5621914863586426, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.875862717628479, + "num_tokens": 483834322.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6247612237930298, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8800255060195923, + "num_tokens": 483866912.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6481748819351196, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8699483275413513, + "num_tokens": 483904782.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.578331470489502, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.871222734451294, + "num_tokens": 483945033.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6424469947814941, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8739548921585083, + "num_tokens": 483981251.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.564022183418274, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8647581338882446, + "num_tokens": 484021501.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 3.6851978302001953, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8652776479721069, + "num_tokens": 484062272.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5512961149215698, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8904213905334473, + "num_tokens": 484100440.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.676684021949768, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8526037931442261, + "num_tokens": 484137163.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4752558469772339, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8877106308937073, + "num_tokens": 484173243.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5011500120162964, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8729846477508545, + "num_tokens": 484217244.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6149771213531494, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8650363683700562, + "num_tokens": 484255435.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6816157102584839, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.871794581413269, + "num_tokens": 484289582.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6942813396453857, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8826991319656372, + "num_tokens": 484324360.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5163456201553345, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8770433664321899, + "num_tokens": 484369302.0, + "step": 12691 + }, + { + "epoch": 1.6145528558707545, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6997134685516357, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8740348815917969, + "num_tokens": 484406591.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4359240531921387, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8753724098205566, + "num_tokens": 484449864.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6609371900558472, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8838032484054565, + "num_tokens": 484485261.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6024457216262817, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8732500076293945, + "num_tokens": 484525187.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5452543497085571, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8621739745140076, + "num_tokens": 484570427.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5074009895324707, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.885855495929718, + "num_tokens": 484607399.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4479176998138428, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8824944496154785, + "num_tokens": 484648355.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5629733800888062, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8489022850990295, + "num_tokens": 484689898.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6893645524978638, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8756619095802307, + "num_tokens": 484727565.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5418707132339478, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.873665452003479, + "num_tokens": 484767847.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6534538269042969, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8726900219917297, + "num_tokens": 484804243.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5459990501403809, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8706834316253662, + "num_tokens": 484845686.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5321730375289917, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8669764995574951, + "num_tokens": 484886989.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6710914373397827, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8564134240150452, + "num_tokens": 484925182.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.483018159866333, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8739566802978516, + "num_tokens": 484964325.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6575326919555664, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8780844211578369, + "num_tokens": 485000216.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.61690354347229, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8629969358444214, + "num_tokens": 485038203.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6771749258041382, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8771328926086426, + "num_tokens": 485070938.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6763932704925537, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8466829061508179, + "num_tokens": 485111468.0, + "step": 12710 + }, + { + "epoch": 1.616969851163974, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6506826877593994, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8569175601005554, + "num_tokens": 485147646.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5576525926589966, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8696876764297485, + "num_tokens": 485186904.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6291775703430176, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8873147964477539, + "num_tokens": 485222111.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6160253286361694, + "learning_rate": 1e-06, + "loss": 0.3202, + "mean_token_accuracy": 0.8981945514678955, + "num_tokens": 485256385.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5916584730148315, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8708316087722778, + "num_tokens": 485297181.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6063019037246704, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8605221509933472, + "num_tokens": 485335786.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6561219692230225, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8594314455986023, + "num_tokens": 485374172.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.547070026397705, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8574528694152832, + "num_tokens": 485416148.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7424525022506714, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8568106889724731, + "num_tokens": 485451965.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.415889024734497, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8673763275146484, + "num_tokens": 485498503.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5380494594573975, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.872078001499176, + "num_tokens": 485538692.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.700519323348999, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8633478283882141, + "num_tokens": 485578135.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6203765869140625, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8893986940383911, + "num_tokens": 485609880.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7928158044815063, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8565686941146851, + "num_tokens": 485642218.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5708870887756348, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8725566864013672, + "num_tokens": 485684610.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5380631685256958, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8665934801101685, + "num_tokens": 485726178.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.476270318031311, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8721157312393188, + "num_tokens": 485769214.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6429861783981323, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8733701109886169, + "num_tokens": 485804900.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5911364555358887, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8845193386077881, + "num_tokens": 485842289.0, + "step": 12729 + }, + { + "epoch": 1.6193868464571937, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6394399404525757, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8688908219337463, + "num_tokens": 485879496.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6606818437576294, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8753466606140137, + "num_tokens": 485913844.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.7643736600875854, + "learning_rate": 1e-06, + "loss": 0.3065, + "mean_token_accuracy": 0.8980216979980469, + "num_tokens": 485950361.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.650235891342163, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.865503191947937, + "num_tokens": 485987611.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6727255582809448, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8663381338119507, + "num_tokens": 486028010.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6788111925125122, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8591570854187012, + "num_tokens": 486065140.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5563631057739258, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.873755931854248, + "num_tokens": 486103620.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6384031772613525, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8652176260948181, + "num_tokens": 486138477.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.8274157047271729, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.866590678691864, + "num_tokens": 486176769.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.9088815450668335, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8452546000480652, + "num_tokens": 486211389.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7411226034164429, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8821326494216919, + "num_tokens": 486247435.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5710889101028442, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8531312346458435, + "num_tokens": 486293329.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.641218662261963, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8838398456573486, + "num_tokens": 486328659.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.631696343421936, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8733401894569397, + "num_tokens": 486364594.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7483946084976196, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8714398145675659, + "num_tokens": 486399809.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.681524634361267, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8756499290466309, + "num_tokens": 486433261.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6555763483047485, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8700070381164551, + "num_tokens": 486472413.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5030734539031982, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.875504732131958, + "num_tokens": 486513446.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.676901936531067, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8623030781745911, + "num_tokens": 486551089.0, + "step": 12748 + }, + { + "epoch": 1.6218038417504135, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6304982900619507, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8596003651618958, + "num_tokens": 486589252.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5271373987197876, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8723443746566772, + "num_tokens": 486630516.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.4983083009719849, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8712314963340759, + "num_tokens": 486672184.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.573648452758789, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8652609586715698, + "num_tokens": 486714877.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6682231426239014, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8785190582275391, + "num_tokens": 486750455.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6563184261322021, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8598853945732117, + "num_tokens": 486786645.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6467790603637695, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8831210136413574, + "num_tokens": 486823024.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6896042823791504, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.875061571598053, + "num_tokens": 486860827.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.654505729675293, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8701601028442383, + "num_tokens": 486895747.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.5027638673782349, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8715146780014038, + "num_tokens": 486940943.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.614734172821045, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8792241811752319, + "num_tokens": 486975436.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.6460293531417847, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8753883838653564, + "num_tokens": 487009916.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 2.372264862060547e-05, + "grad_norm": 1.646467685699463, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.873275637626648, + "num_tokens": 487045868.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6470298767089844, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.861536979675293, + "num_tokens": 487085898.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6109377145767212, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.865477442741394, + "num_tokens": 487129228.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.633200764656067, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8718811273574829, + "num_tokens": 487167737.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6924012899398804, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8669443130493164, + "num_tokens": 487202572.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6226580142974854, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8731063008308411, + "num_tokens": 487238519.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5540618896484375, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8843876719474792, + "num_tokens": 487278636.0, + "step": 12767 + }, + { + "epoch": 1.6242208370436333, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4878267049789429, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8820536136627197, + "num_tokens": 487318650.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.586660623550415, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8819054961204529, + "num_tokens": 487352006.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.703221321105957, + "learning_rate": 1e-06, + "loss": 0.332, + "mean_token_accuracy": 0.8899304866790771, + "num_tokens": 487382768.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.57229483127594, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8697813749313354, + "num_tokens": 487423736.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5722737312316895, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8605008125305176, + "num_tokens": 487464631.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5077695846557617, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8796966671943665, + "num_tokens": 487501901.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5235493183135986, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.878035306930542, + "num_tokens": 487538712.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4858283996582031, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8814907073974609, + "num_tokens": 487579170.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5510920286178589, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8707002401351929, + "num_tokens": 487618511.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4852750301361084, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.880801796913147, + "num_tokens": 487656625.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.681579351425171, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8620280623435974, + "num_tokens": 487689857.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6556118726730347, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8731719255447388, + "num_tokens": 487733626.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6026639938354492, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8609361052513123, + "num_tokens": 487771895.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7193635702133179, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8615161180496216, + "num_tokens": 487808603.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4498165845870972, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8852335214614868, + "num_tokens": 487851347.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5701619386672974, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8609859347343445, + "num_tokens": 487892783.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.8403079509735107, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.858505129814148, + "num_tokens": 487929258.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7559106349945068, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8648882508277893, + "num_tokens": 487966398.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5866527557373047, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8677623271942139, + "num_tokens": 488007166.0, + "step": 12786 + }, + { + "epoch": 1.6266378323368529, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6239712238311768, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8716424703598022, + "num_tokens": 488043144.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4967741966247559, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8678030967712402, + "num_tokens": 488083482.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.582373857498169, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8709971904754639, + "num_tokens": 488123319.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.633041501045227, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8835173845291138, + "num_tokens": 488158366.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7026233673095703, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8443622589111328, + "num_tokens": 488199695.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5534899234771729, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.868282675743103, + "num_tokens": 488245910.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.751865029335022, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8842824697494507, + "num_tokens": 488273813.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6063597202301025, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8692944049835205, + "num_tokens": 488310978.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6554371118545532, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8827809691429138, + "num_tokens": 488346614.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6320538520812988, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8659087419509888, + "num_tokens": 488387434.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.761956810951233, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8723426461219788, + "num_tokens": 488422139.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6800081729888916, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8624902367591858, + "num_tokens": 488457415.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.8748514652252197, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8557987213134766, + "num_tokens": 488486503.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6339232921600342, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8753540515899658, + "num_tokens": 488522136.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6460458040237427, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8748632669448853, + "num_tokens": 488559827.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6017926931381226, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8640241026878357, + "num_tokens": 488598723.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.586719036102295, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8738207221031189, + "num_tokens": 488637160.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6547017097473145, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8805539608001709, + "num_tokens": 488671137.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.750727891921997, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8668472766876221, + "num_tokens": 488701891.0, + "step": 12805 + }, + { + "epoch": 1.6290548276300725, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6042253971099854, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8726567029953003, + "num_tokens": 488738591.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5407841205596924, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8817766308784485, + "num_tokens": 488776246.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.532363772392273, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8708608150482178, + "num_tokens": 488816448.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6992276906967163, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8715636730194092, + "num_tokens": 488851755.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7638341188430786, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8719222545623779, + "num_tokens": 488886712.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6024665832519531, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8550211191177368, + "num_tokens": 488928024.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5369588136672974, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8757756948471069, + "num_tokens": 488966525.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6867483854293823, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8780311346054077, + "num_tokens": 489002470.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6095346212387085, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.87440025806427, + "num_tokens": 489040006.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5871381759643555, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8813498020172119, + "num_tokens": 489079339.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4831916093826294, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.877140998840332, + "num_tokens": 489121104.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5552138090133667, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8744139671325684, + "num_tokens": 489161654.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6072720289230347, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8706940412521362, + "num_tokens": 489200035.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5621265172958374, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8831924796104431, + "num_tokens": 489235471.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5840816497802734, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8849679231643677, + "num_tokens": 489273790.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7090551853179932, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8615105152130127, + "num_tokens": 489310794.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5053269863128662, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8665422797203064, + "num_tokens": 489363590.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.497041940689087, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8694210052490234, + "num_tokens": 489403860.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4438661336898804, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8760435581207275, + "num_tokens": 489449324.0, + "step": 12824 + }, + { + "epoch": 1.631471822923292, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5809470415115356, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8662082552909851, + "num_tokens": 489485670.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5029938220977783, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8673774003982544, + "num_tokens": 489528971.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5344462394714355, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.872125506401062, + "num_tokens": 489566247.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6053587198257446, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8686230778694153, + "num_tokens": 489602494.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5560369491577148, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8676734566688538, + "num_tokens": 489641883.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.584683895111084, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8640797734260559, + "num_tokens": 489682096.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5594687461853027, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8685482740402222, + "num_tokens": 489720609.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.550041913986206, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8570076823234558, + "num_tokens": 489762820.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5520553588867188, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8755710124969482, + "num_tokens": 489803163.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6495739221572876, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8818437457084656, + "num_tokens": 489839893.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6402748823165894, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8535243272781372, + "num_tokens": 489878572.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.730782389640808, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.858906626701355, + "num_tokens": 489912910.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4878959655761719, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8690470457077026, + "num_tokens": 489954212.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6958355903625488, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.87532639503479, + "num_tokens": 489987983.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.517958164215088, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8727044463157654, + "num_tokens": 490028652.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5082148313522339, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8707340955734253, + "num_tokens": 490072933.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.714928150177002, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8643804788589478, + "num_tokens": 490107421.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.770336627960205, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8519985675811768, + "num_tokens": 490147999.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7524068355560303, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8739727735519409, + "num_tokens": 490183428.0, + "step": 12843 + }, + { + "epoch": 1.6338888182165119, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6086000204086304, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8700322508811951, + "num_tokens": 490224447.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6742254495620728, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.875002920627594, + "num_tokens": 490258182.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.4972323179244995, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8683925271034241, + "num_tokens": 490300595.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6751041412353516, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8778387904167175, + "num_tokens": 490338797.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5800082683563232, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8705930113792419, + "num_tokens": 490380536.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7873814105987549, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8664844036102295, + "num_tokens": 490419662.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.624993085861206, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8718870282173157, + "num_tokens": 490455353.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6117894649505615, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.880122184753418, + "num_tokens": 490491156.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6353750228881836, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8710655570030212, + "num_tokens": 490528625.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6212360858917236, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8697800636291504, + "num_tokens": 490567682.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.583734154701233, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8670434951782227, + "num_tokens": 490606930.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6812152862548828, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.866210401058197, + "num_tokens": 490643094.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.8336255550384521, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8654152154922485, + "num_tokens": 490683282.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7501583099365234, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8759464025497437, + "num_tokens": 490715983.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7209713459014893, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8579244613647461, + "num_tokens": 490753437.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7075819969177246, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8581596612930298, + "num_tokens": 490789817.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6886900663375854, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8722773790359497, + "num_tokens": 490825832.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5968753099441528, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8715345859527588, + "num_tokens": 490860793.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7257195711135864, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8615634441375732, + "num_tokens": 490895455.0, + "step": 12862 + }, + { + "epoch": 1.6363058135097317, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6655081510543823, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.870137095451355, + "num_tokens": 490933957.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6070104837417603, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8853503465652466, + "num_tokens": 490965206.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7287912368774414, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8769397735595703, + "num_tokens": 490996007.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6664327383041382, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8664078712463379, + "num_tokens": 491035902.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6792489290237427, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8803547620773315, + "num_tokens": 491074389.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.613731026649475, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8799405694007874, + "num_tokens": 491113066.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6138005256652832, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8713165521621704, + "num_tokens": 491149612.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5992190837860107, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8455649614334106, + "num_tokens": 491193445.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6503621339797974, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8828895092010498, + "num_tokens": 491232279.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7959940433502197, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8703476786613464, + "num_tokens": 491268590.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7162355184555054, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.870261013507843, + "num_tokens": 491309717.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5908210277557373, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8687758445739746, + "num_tokens": 491350176.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6472951173782349, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8573933839797974, + "num_tokens": 491385632.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.591141939163208, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8822610378265381, + "num_tokens": 491423527.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7659330368041992, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8783938884735107, + "num_tokens": 491452568.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.7336454391479492, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.875907301902771, + "num_tokens": 491490294.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.68600332736969, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8661223649978638, + "num_tokens": 491529645.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.6691913604736328, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8682980537414551, + "num_tokens": 491561915.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.5454915761947632, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8645766377449036, + "num_tokens": 491602897.0, + "step": 12881 + }, + { + "epoch": 1.6387228088029513, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7308838367462158, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8714981079101562, + "num_tokens": 491642128.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5968505144119263, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8891459703445435, + "num_tokens": 491680093.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7480974197387695, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.873701810836792, + "num_tokens": 491713503.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.563563585281372, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8570060133934021, + "num_tokens": 491756047.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5772947072982788, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8720366954803467, + "num_tokens": 491800904.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7440471649169922, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8555130958557129, + "num_tokens": 491836520.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 2.396106719970703e-05, + "grad_norm": 1.756846308708191, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8558847308158875, + "num_tokens": 491871440.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6635820865631104, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8779054880142212, + "num_tokens": 491908874.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5948419570922852, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.883878231048584, + "num_tokens": 491945396.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6502381563186646, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8819792866706848, + "num_tokens": 491976944.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5774396657943726, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8839967250823975, + "num_tokens": 492018037.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5198777914047241, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8642409443855286, + "num_tokens": 492065550.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6528629064559937, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8741163015365601, + "num_tokens": 492103496.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6717077493667603, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8717779517173767, + "num_tokens": 492137752.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4333020448684692, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8714115619659424, + "num_tokens": 492181607.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.587352991104126, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8802725076675415, + "num_tokens": 492222307.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7596830129623413, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8828437924385071, + "num_tokens": 492254915.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4680811166763306, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8909632563591003, + "num_tokens": 492296622.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5144447088241577, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.872692346572876, + "num_tokens": 492338937.0, + "step": 12900 + }, + { + "epoch": 1.6411398040961709, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.8278096914291382, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8535158634185791, + "num_tokens": 492371845.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.8368065357208252, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8777016401290894, + "num_tokens": 492405143.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6781036853790283, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8667107820510864, + "num_tokens": 492441491.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5182439088821411, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8743362426757812, + "num_tokens": 492481632.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.650215983390808, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.874967098236084, + "num_tokens": 492519264.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6619292497634888, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8748021125793457, + "num_tokens": 492554662.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.487979531288147, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8792188167572021, + "num_tokens": 492601779.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6357226371765137, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8784293532371521, + "num_tokens": 492636559.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5191518068313599, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8821989297866821, + "num_tokens": 492674832.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.564643383026123, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8717902302742004, + "num_tokens": 492713839.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.612721562385559, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8753765225410461, + "num_tokens": 492747570.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6405305862426758, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8673306703567505, + "num_tokens": 492786141.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6423922777175903, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8491607904434204, + "num_tokens": 492827570.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5251052379608154, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8627471923828125, + "num_tokens": 492871316.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6019188165664673, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8616921305656433, + "num_tokens": 492910781.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6454325914382935, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.874029278755188, + "num_tokens": 492949462.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5605367422103882, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8733838200569153, + "num_tokens": 492988753.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5846596956253052, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8786982297897339, + "num_tokens": 493030294.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5771825313568115, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8729003071784973, + "num_tokens": 493066221.0, + "step": 12919 + }, + { + "epoch": 1.6435567993893907, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4883129596710205, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.873920202255249, + "num_tokens": 493107760.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5155291557312012, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8906381726264954, + "num_tokens": 493147569.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6798770427703857, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8746993541717529, + "num_tokens": 493185940.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.776212453842163, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8757707476615906, + "num_tokens": 493224598.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7248239517211914, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8536607027053833, + "num_tokens": 493265599.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6949913501739502, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8497167825698853, + "num_tokens": 493300967.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.540347695350647, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8756123781204224, + "num_tokens": 493344963.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6626614332199097, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8822194337844849, + "num_tokens": 493383632.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6879067420959473, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8815590739250183, + "num_tokens": 493420570.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7014495134353638, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8621208071708679, + "num_tokens": 493461580.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4359906911849976, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8701184988021851, + "num_tokens": 493504751.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.8130974769592285, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8588771820068359, + "num_tokens": 493537274.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7195417881011963, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8779263496398926, + "num_tokens": 493573929.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7478418350219727, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8597210049629211, + "num_tokens": 493606385.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7021745443344116, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8696433305740356, + "num_tokens": 493643240.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5798959732055664, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8802604079246521, + "num_tokens": 493680462.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6112661361694336, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8860670328140259, + "num_tokens": 493714417.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6116812229156494, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8714887499809265, + "num_tokens": 493752940.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6164294481277466, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8846385478973389, + "num_tokens": 493789861.0, + "step": 12938 + }, + { + "epoch": 1.6459737946826105, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4934114217758179, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8825953602790833, + "num_tokens": 493829492.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.83077871799469, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8873289227485657, + "num_tokens": 493865810.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6661368608474731, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8671767711639404, + "num_tokens": 493903523.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6196892261505127, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8678637742996216, + "num_tokens": 493940219.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5866672992706299, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8702650666236877, + "num_tokens": 493982074.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6189512014389038, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8728350400924683, + "num_tokens": 494020280.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5974470376968384, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.86908358335495, + "num_tokens": 494061528.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5032824277877808, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8760503530502319, + "num_tokens": 494101105.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5910303592681885, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8826519846916199, + "num_tokens": 494137759.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5854413509368896, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8757128119468689, + "num_tokens": 494177119.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.8918832540512085, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8473397493362427, + "num_tokens": 494213443.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6102737188339233, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8637140989303589, + "num_tokens": 494253383.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5894880294799805, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8743314743041992, + "num_tokens": 494290620.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6651391983032227, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8733007907867432, + "num_tokens": 494330308.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7436461448669434, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8748718500137329, + "num_tokens": 494362093.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5678337812423706, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8481786251068115, + "num_tokens": 494407318.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7385609149932861, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8610330820083618, + "num_tokens": 494445798.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5764319896697998, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8871707320213318, + "num_tokens": 494482872.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5073106288909912, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8760390877723694, + "num_tokens": 494524245.0, + "step": 12957 + }, + { + "epoch": 1.64839078997583, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.4163949489593506, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8722445964813232, + "num_tokens": 494571270.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7016161680221558, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8608916997909546, + "num_tokens": 494606451.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5120961666107178, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8819851875305176, + "num_tokens": 494646303.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7043981552124023, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.860171914100647, + "num_tokens": 494680696.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7199736833572388, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.855787456035614, + "num_tokens": 494717992.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5110833644866943, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8807483911514282, + "num_tokens": 494757499.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.684906244277954, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8805269002914429, + "num_tokens": 494792016.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6366031169891357, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8625011444091797, + "num_tokens": 494829446.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5546437501907349, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8755318522453308, + "num_tokens": 494868073.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6096420288085938, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8687800168991089, + "num_tokens": 494909484.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6450270414352417, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8649543523788452, + "num_tokens": 494949368.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6057127714157104, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.874383270740509, + "num_tokens": 494986241.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.3958297967910767, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8757203817367554, + "num_tokens": 495032291.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6567665338516235, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8810528516769409, + "num_tokens": 495067280.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6749188899993896, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8794615268707275, + "num_tokens": 495104261.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5285321474075317, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8885448575019836, + "num_tokens": 495142133.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6074633598327637, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8884329795837402, + "num_tokens": 495178017.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6339633464813232, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.850579023361206, + "num_tokens": 495217994.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6400535106658936, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8629922866821289, + "num_tokens": 495256261.0, + "step": 12976 + }, + { + "epoch": 1.6508077852690497, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5948818922042847, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8694336414337158, + "num_tokens": 495299907.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7886431217193604, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8897616267204285, + "num_tokens": 495329167.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6571139097213745, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8674284815788269, + "num_tokens": 495366534.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5827398300170898, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8694172501564026, + "num_tokens": 495403714.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5412853956222534, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8893139362335205, + "num_tokens": 495444103.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7411147356033325, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8690181970596313, + "num_tokens": 495480407.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.6128441095352173, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8732807040214539, + "num_tokens": 495519462.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5283622741699219, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.882013738155365, + "num_tokens": 495559893.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5697524547576904, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8675845861434937, + "num_tokens": 495601948.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.664088487625122, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8690122365951538, + "num_tokens": 495640222.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5922496318817139, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.878328800201416, + "num_tokens": 495680013.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6435471773147583, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8673042058944702, + "num_tokens": 495717606.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7101222276687622, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8797263503074646, + "num_tokens": 495753336.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.5205144882202148, + "learning_rate": 1e-06, + "loss": 0.3256, + "mean_token_accuracy": 0.8929818868637085, + "num_tokens": 495791013.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6594513654708862, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8715435266494751, + "num_tokens": 495828018.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6351901292800903, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8685164451599121, + "num_tokens": 495866044.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6556665897369385, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8512254953384399, + "num_tokens": 495903217.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6495773792266846, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8705770373344421, + "num_tokens": 495942163.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.575203776359558, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8725623488426208, + "num_tokens": 495982168.0, + "step": 12995 + }, + { + "epoch": 1.6532247805622693, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7644145488739014, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8585265874862671, + "num_tokens": 496018343.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6378610134124756, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8590602874755859, + "num_tokens": 496056911.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.539980173110962, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8669407367706299, + "num_tokens": 496098386.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 2.4080276489257812e-05, + "grad_norm": 1.7241381406784058, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.852279782295227, + "num_tokens": 496136313.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7212468385696411, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8702524900436401, + "num_tokens": 496171512.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5606144666671753, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8708341121673584, + "num_tokens": 496208642.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6013271808624268, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8738274574279785, + "num_tokens": 496246328.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6238644123077393, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.870570719242096, + "num_tokens": 496285367.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6110961437225342, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8653812408447266, + "num_tokens": 496328932.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6574559211730957, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8831283450126648, + "num_tokens": 496362879.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6988613605499268, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8687068223953247, + "num_tokens": 496397592.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.479123830795288, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8587530255317688, + "num_tokens": 496444474.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5983929634094238, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8602404594421387, + "num_tokens": 496485261.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.65498948097229, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8564105033874512, + "num_tokens": 496523607.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7603925466537476, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.856608510017395, + "num_tokens": 496556207.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.568855881690979, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8740817308425903, + "num_tokens": 496594567.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5502337217330933, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8757539987564087, + "num_tokens": 496633771.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7608686685562134, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8742190599441528, + "num_tokens": 496663493.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4585330486297607, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8757883310317993, + "num_tokens": 496704637.0, + "step": 13014 + }, + { + "epoch": 1.655641775855489, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5812503099441528, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.866003155708313, + "num_tokens": 496743169.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5589993000030518, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8678985834121704, + "num_tokens": 496783664.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.524222493171692, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8603805899620056, + "num_tokens": 496825977.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6109082698822021, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8700023889541626, + "num_tokens": 496866739.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5872710943222046, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.883948028087616, + "num_tokens": 496902524.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6339592933654785, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8732462525367737, + "num_tokens": 496939989.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.8307348489761353, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8645762801170349, + "num_tokens": 496973301.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.597964882850647, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8780034184455872, + "num_tokens": 497013154.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6565756797790527, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.862057626247406, + "num_tokens": 497052417.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5627135038375854, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8581908941268921, + "num_tokens": 497096001.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7874709367752075, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8732150793075562, + "num_tokens": 497131354.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.662009596824646, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8561299443244934, + "num_tokens": 497167764.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5237361192703247, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8758786916732788, + "num_tokens": 497206143.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6682285070419312, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8680642247200012, + "num_tokens": 497239360.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5342788696289062, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8753081560134888, + "num_tokens": 497278274.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.9493205547332764, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8694037199020386, + "num_tokens": 497307882.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6416183710098267, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8902585506439209, + "num_tokens": 497340759.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5080121755599976, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8860312700271606, + "num_tokens": 497381321.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5528409481048584, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8609741926193237, + "num_tokens": 497423034.0, + "step": 13033 + }, + { + "epoch": 1.658058771148709, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6033201217651367, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.868561327457428, + "num_tokens": 497461972.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5852125883102417, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8698951601982117, + "num_tokens": 497505277.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6265785694122314, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8798522353172302, + "num_tokens": 497542028.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.8252828121185303, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8509722948074341, + "num_tokens": 497578194.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5237818956375122, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8717705011367798, + "num_tokens": 497617849.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5402414798736572, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8731001019477844, + "num_tokens": 497657936.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5623059272766113, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8762686252593994, + "num_tokens": 497697409.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7015862464904785, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8572081327438354, + "num_tokens": 497739374.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6072596311569214, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8706638216972351, + "num_tokens": 497779521.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6275010108947754, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.854429304599762, + "num_tokens": 497821649.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6394082307815552, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8644659519195557, + "num_tokens": 497859216.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6472078561782837, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.868995189666748, + "num_tokens": 497893126.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6704041957855225, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8744251728057861, + "num_tokens": 497930391.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7751939296722412, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8522990942001343, + "num_tokens": 497967974.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6215142011642456, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8592354655265808, + "num_tokens": 498006063.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6020559072494507, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8744304180145264, + "num_tokens": 498041742.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7552658319473267, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8552734851837158, + "num_tokens": 498077599.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.619023084640503, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8571299314498901, + "num_tokens": 498123556.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.623830795288086, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8826214075088501, + "num_tokens": 498156921.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4827693700790405, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8867125511169434, + "num_tokens": 498194601.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4560673236846924, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8803394436836243, + "num_tokens": 498240107.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4676649570465088, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8861082792282104, + "num_tokens": 498279813.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7469221353530884, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8656942844390869, + "num_tokens": 498315999.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5632699728012085, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8773208856582642, + "num_tokens": 498357177.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6797581911087036, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8732146620750427, + "num_tokens": 498389221.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.56094491481781, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8888276219367981, + "num_tokens": 498424250.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7551000118255615, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8650217056274414, + "num_tokens": 498461588.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5712127685546875, + "learning_rate": 1e-06, + "loss": 0.3225, + "mean_token_accuracy": 0.8947203159332275, + "num_tokens": 498495579.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5463043451309204, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8696964383125305, + "num_tokens": 498537092.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6225359439849854, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8776946067810059, + "num_tokens": 498573992.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6474523544311523, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8724343776702881, + "num_tokens": 498610336.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6822878122329712, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.865936815738678, + "num_tokens": 498646775.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5825741291046143, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8630880117416382, + "num_tokens": 498687403.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5670640468597412, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8796261548995972, + "num_tokens": 498727772.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.579156517982483, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8618761301040649, + "num_tokens": 498771483.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4899091720581055, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8752695322036743, + "num_tokens": 498814798.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5946956872940063, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8782439827919006, + "num_tokens": 498851743.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5686671733856201, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8666990995407104, + "num_tokens": 498887251.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.627143144607544, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8674697875976562, + "num_tokens": 498924111.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7011492252349854, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8673669099807739, + "num_tokens": 498964510.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.538677453994751, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8708704710006714, + "num_tokens": 499006093.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6864795684814453, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8576776385307312, + "num_tokens": 499043158.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6435956954956055, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8767848014831543, + "num_tokens": 499077800.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4915729761123657, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8794208765029907, + "num_tokens": 499121122.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5643771886825562, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8748075366020203, + "num_tokens": 499159644.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6967124938964844, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8811416625976562, + "num_tokens": 499197282.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6597936153411865, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8682553768157959, + "num_tokens": 499237395.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.554917335510254, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8671746253967285, + "num_tokens": 499278337.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7426098585128784, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8723748326301575, + "num_tokens": 499314122.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6471151113510132, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8587956428527832, + "num_tokens": 499358820.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4214859008789062, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8791400790214539, + "num_tokens": 499404143.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7521742582321167, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8685638904571533, + "num_tokens": 499436665.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5349396467208862, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8677268624305725, + "num_tokens": 499479212.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5758072137832642, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8656017780303955, + "num_tokens": 499519608.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7843742370605469, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8698662519454956, + "num_tokens": 499553464.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7472057342529297, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.862942099571228, + "num_tokens": 499588884.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7562079429626465, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8587415218353271, + "num_tokens": 499627945.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7036387920379639, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8719425201416016, + "num_tokens": 499663226.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5537220239639282, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8653049468994141, + "num_tokens": 499705858.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5618239641189575, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8575084209442139, + "num_tokens": 499748493.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5944849252700806, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8820371627807617, + "num_tokens": 499791262.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6121487617492676, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8534557819366455, + "num_tokens": 499829393.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6093782186508179, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8623589277267456, + "num_tokens": 499869415.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6610262393951416, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8653284907341003, + "num_tokens": 499908865.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5556745529174805, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8754774332046509, + "num_tokens": 499945963.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7612648010253906, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8505406379699707, + "num_tokens": 499981476.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.564545750617981, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8674267530441284, + "num_tokens": 500022726.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.568454384803772, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8663917779922485, + "num_tokens": 500064030.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4468152523040771, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8850548267364502, + "num_tokens": 500103751.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.630122423171997, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8796531558036804, + "num_tokens": 500137649.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5125484466552734, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8772156238555908, + "num_tokens": 500180071.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7870385646820068, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8553364276885986, + "num_tokens": 500214381.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6802781820297241, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8754209876060486, + "num_tokens": 500248815.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5575734376907349, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8848764300346375, + "num_tokens": 500290022.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7298932075500488, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8804965615272522, + "num_tokens": 500319299.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6553722620010376, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.870126485824585, + "num_tokens": 500355114.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.688903570175171, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8796364068984985, + "num_tokens": 500392774.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5231974124908447, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8797076940536499, + "num_tokens": 500432320.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.602689504623413, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8697887659072876, + "num_tokens": 500470509.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5998928546905518, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8787527680397034, + "num_tokens": 500507500.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4188504219055176, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8683743476867676, + "num_tokens": 500553867.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6697518825531006, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8530925512313843, + "num_tokens": 500590057.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4450929164886475, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8833715915679932, + "num_tokens": 500627941.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6199052333831787, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8693771362304688, + "num_tokens": 500666071.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5885818004608154, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8790990114212036, + "num_tokens": 500701338.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4775766134262085, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8643931746482849, + "num_tokens": 500744897.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.605107307434082, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8746857643127441, + "num_tokens": 500784048.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7625865936279297, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8672803640365601, + "num_tokens": 500817214.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6025811433792114, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8635724782943726, + "num_tokens": 500856618.0, + "step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6570721864700317, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8677683472633362, + "num_tokens": 500891270.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.594971776008606, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.861518144607544, + "num_tokens": 500931246.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5033522844314575, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8809971213340759, + "num_tokens": 500970281.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6521652936935425, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8491976261138916, + "num_tokens": 501014948.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5539681911468506, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.868037223815918, + "num_tokens": 501056361.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5513052940368652, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8758603930473328, + "num_tokens": 501093908.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.613501787185669, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8746179342269897, + "num_tokens": 501130692.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6938878297805786, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8563157916069031, + "num_tokens": 501165635.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7542893886566162, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.84450364112854, + "num_tokens": 501200903.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6133997440338135, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8763388991355896, + "num_tokens": 501235288.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.573016881942749, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.875572919845581, + "num_tokens": 501274211.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5840243101119995, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8729448318481445, + "num_tokens": 501312270.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6102384328842163, + "learning_rate": 1e-06, + "loss": 0.3155, + "mean_token_accuracy": 0.8961296081542969, + "num_tokens": 501344330.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7244467735290527, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8686771392822266, + "num_tokens": 501380296.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5948164463043213, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8824118971824646, + "num_tokens": 501418841.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5318355560302734, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8755096197128296, + "num_tokens": 501461998.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.591870903968811, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8917759656906128, + "num_tokens": 501499012.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7099175453186035, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8617846965789795, + "num_tokens": 501530734.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6745233535766602, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8630880117416382, + "num_tokens": 501569042.0, + "step": 13141 + }, + { + "epoch": 1.6717974812364838, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5560336112976074, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8813151717185974, + "num_tokens": 501608056.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7073867321014404, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8743970394134521, + "num_tokens": 501641896.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6805002689361572, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.870800256729126, + "num_tokens": 501683138.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5936237573623657, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8693708181381226, + "num_tokens": 501722523.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5782805681228638, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8614441156387329, + "num_tokens": 501765624.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5225285291671753, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8691960573196411, + "num_tokens": 501806675.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7445857524871826, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8571023941040039, + "num_tokens": 501844992.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.7950985431671143, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8659111261367798, + "num_tokens": 501883276.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6282364130020142, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8670571446418762, + "num_tokens": 501924760.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6889660358428955, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8486124277114868, + "num_tokens": 501961547.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5803391933441162, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8641446828842163, + "num_tokens": 502001533.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5619677305221558, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8824901580810547, + "num_tokens": 502036249.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6186221837997437, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8752780556678772, + "num_tokens": 502071739.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7634674310684204, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8751967549324036, + "num_tokens": 502105005.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7214791774749756, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8723465800285339, + "num_tokens": 502138666.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5636900663375854, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.869837760925293, + "num_tokens": 502179455.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5513368844985962, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8703860640525818, + "num_tokens": 502220867.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5192079544067383, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8765679597854614, + "num_tokens": 502261891.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6724728345870972, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8723622560501099, + "num_tokens": 502299858.0, + "step": 13160 + }, + { + "epoch": 1.6742144765297036, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6258859634399414, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8843122720718384, + "num_tokens": 502337768.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6142728328704834, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8552266955375671, + "num_tokens": 502378197.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5754170417785645, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8781746625900269, + "num_tokens": 502416647.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4911807775497437, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8642871379852295, + "num_tokens": 502461068.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4796425104141235, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8797469735145569, + "num_tokens": 502505873.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5120617151260376, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8740563988685608, + "num_tokens": 502546343.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.4676127433776855, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8798458576202393, + "num_tokens": 502585490.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.6310336589813232, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8641093969345093, + "num_tokens": 502621028.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.528566837310791, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.879065752029419, + "num_tokens": 502664301.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5133105516433716, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8712008595466614, + "num_tokens": 502705258.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5314029455184937, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8779797554016113, + "num_tokens": 502741050.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5793544054031372, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8706415295600891, + "num_tokens": 502782072.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.554879903793335, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.877948522567749, + "num_tokens": 502821737.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 2.4199485778808594e-05, + "grad_norm": 1.5806524753570557, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8914929032325745, + "num_tokens": 502859570.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5751019716262817, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8822633028030396, + "num_tokens": 502892746.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6102619171142578, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.870235800743103, + "num_tokens": 502929450.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6524677276611328, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8665626645088196, + "num_tokens": 502967847.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5345094203948975, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.878511905670166, + "num_tokens": 503006683.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.3801709413528442, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8867020010948181, + "num_tokens": 503050712.0, + "step": 13179 + }, + { + "epoch": 1.6766314718229234, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4367446899414062, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8784680366516113, + "num_tokens": 503092754.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5693012475967407, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8815938234329224, + "num_tokens": 503130230.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5877858400344849, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8697466850280762, + "num_tokens": 503169280.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5281121730804443, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8736177682876587, + "num_tokens": 503207397.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5091229677200317, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8803821802139282, + "num_tokens": 503247405.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5703269243240356, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.879592776298523, + "num_tokens": 503283584.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7966419458389282, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8640178442001343, + "num_tokens": 503322752.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.9366592168807983, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8574509024620056, + "num_tokens": 503353439.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6363029479980469, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8713279366493225, + "num_tokens": 503387294.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4461537599563599, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8781200647354126, + "num_tokens": 503427297.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5951322317123413, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8797808289527893, + "num_tokens": 503461231.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5319033861160278, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8574570417404175, + "num_tokens": 503501897.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5623949766159058, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8712486028671265, + "num_tokens": 503542091.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.675184965133667, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.88087397813797, + "num_tokens": 503574644.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6046160459518433, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8810931444168091, + "num_tokens": 503611685.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6208029985427856, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8862350583076477, + "num_tokens": 503649228.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5653921365737915, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8670585751533508, + "num_tokens": 503694481.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5798015594482422, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8742625713348389, + "num_tokens": 503732882.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.745330810546875, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8581823110580444, + "num_tokens": 503768499.0, + "step": 13198 + }, + { + "epoch": 1.679048467116143, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5326508283615112, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8794830441474915, + "num_tokens": 503806219.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6420243978500366, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8792887926101685, + "num_tokens": 503846306.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7292643785476685, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8597307801246643, + "num_tokens": 503881315.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6505285501480103, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8642600774765015, + "num_tokens": 503918043.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5113840103149414, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8669180870056152, + "num_tokens": 503959896.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8914464712142944, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8672422766685486, + "num_tokens": 503997275.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6415451765060425, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8897032737731934, + "num_tokens": 504030574.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6347216367721558, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8715717196464539, + "num_tokens": 504065663.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.577458381652832, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8668089509010315, + "num_tokens": 504105808.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5365486145019531, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8704378604888916, + "num_tokens": 504147772.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.617914080619812, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8638363480567932, + "num_tokens": 504186436.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5726407766342163, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8776894807815552, + "num_tokens": 504227967.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5500106811523438, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8740660548210144, + "num_tokens": 504268591.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6908578872680664, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8672142028808594, + "num_tokens": 504304774.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5827211141586304, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8637977838516235, + "num_tokens": 504345288.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.921379804611206, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8496508002281189, + "num_tokens": 504377741.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6960866451263428, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8734722137451172, + "num_tokens": 504416620.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.77836012840271, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8670757412910461, + "num_tokens": 504452821.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7243279218673706, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8764390349388123, + "num_tokens": 504488006.0, + "step": 13217 + }, + { + "epoch": 1.6814654624093626, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5399738550186157, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.872424304485321, + "num_tokens": 504534001.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.650004506111145, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8761653900146484, + "num_tokens": 504567691.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6837090253829956, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8568389415740967, + "num_tokens": 504608672.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.648782730102539, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8522398471832275, + "num_tokens": 504645923.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5201871395111084, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8741112351417542, + "num_tokens": 504686651.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5295612812042236, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8704009056091309, + "num_tokens": 504723977.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5641536712646484, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8761688470840454, + "num_tokens": 504759230.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5423221588134766, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8821544647216797, + "num_tokens": 504798254.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6434718370437622, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8727858066558838, + "num_tokens": 504836744.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.576136827468872, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8794546127319336, + "num_tokens": 504872916.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5460941791534424, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8685215711593628, + "num_tokens": 504916913.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7347105741500854, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8687458038330078, + "num_tokens": 504949389.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7911901473999023, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8593183755874634, + "num_tokens": 504990297.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.9186893701553345, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8620520830154419, + "num_tokens": 505019824.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.706080675125122, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.862918496131897, + "num_tokens": 505055910.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.652621865272522, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8700728416442871, + "num_tokens": 505092474.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7562810182571411, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8634213209152222, + "num_tokens": 505127926.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5283958911895752, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8934339880943298, + "num_tokens": 505164242.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5694162845611572, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8733364343643188, + "num_tokens": 505201069.0, + "step": 13236 + }, + { + "epoch": 1.6838824577025824, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8895478248596191, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8577598333358765, + "num_tokens": 505231551.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5182220935821533, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8703463077545166, + "num_tokens": 505271448.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 2.591945171356201, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8771980404853821, + "num_tokens": 505304371.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6331164836883545, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8642246723175049, + "num_tokens": 505343368.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.591689109802246, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8709544539451599, + "num_tokens": 505382913.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5631060600280762, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8665446043014526, + "num_tokens": 505422632.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.624210238456726, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8685227632522583, + "num_tokens": 505458728.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4608644247055054, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8843778371810913, + "num_tokens": 505497388.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6518083810806274, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.874890923500061, + "num_tokens": 505533479.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6666196584701538, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8718094825744629, + "num_tokens": 505575060.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6766672134399414, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8772746324539185, + "num_tokens": 505608233.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.70353102684021, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8612790107727051, + "num_tokens": 505644256.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.795012354850769, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8683521151542664, + "num_tokens": 505678245.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5844515562057495, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8677481412887573, + "num_tokens": 505719725.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7449333667755127, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8653966784477234, + "num_tokens": 505756792.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5823966264724731, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8744164705276489, + "num_tokens": 505792268.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6713244915008545, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8802858591079712, + "num_tokens": 505827450.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.571020245552063, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.865515947341919, + "num_tokens": 505865512.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.571070909500122, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8782771825790405, + "num_tokens": 505902919.0, + "step": 13255 + }, + { + "epoch": 1.6862994529958022, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6005667448043823, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8745449781417847, + "num_tokens": 505938624.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5755994319915771, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8685897588729858, + "num_tokens": 505975049.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6434231996536255, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8667774200439453, + "num_tokens": 506015202.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.815692663192749, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8340712785720825, + "num_tokens": 506055050.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6420557498931885, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8782297968864441, + "num_tokens": 506087936.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5461052656173706, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8665539026260376, + "num_tokens": 506128656.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6387883424758911, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8811010718345642, + "num_tokens": 506162608.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.500563621520996, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8690484762191772, + "num_tokens": 506203428.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5785648822784424, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8596585392951965, + "num_tokens": 506242688.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5464131832122803, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8695694208145142, + "num_tokens": 506283571.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6661772727966309, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8721100687980652, + "num_tokens": 506316540.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5887149572372437, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.869306743144989, + "num_tokens": 506356564.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6731269359588623, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8721375465393066, + "num_tokens": 506392738.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6760954856872559, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8564594984054565, + "num_tokens": 506427383.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5379060506820679, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8738808631896973, + "num_tokens": 506466842.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.732535481452942, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8726376295089722, + "num_tokens": 506498758.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7195351123809814, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.862739086151123, + "num_tokens": 506535871.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5467441082000732, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8699066042900085, + "num_tokens": 506576894.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5833439826965332, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8672275543212891, + "num_tokens": 506615745.0, + "step": 13274 + }, + { + "epoch": 1.6887164482890218, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5695558786392212, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8735255002975464, + "num_tokens": 506653299.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5058960914611816, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8899369239807129, + "num_tokens": 506686861.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5805023908615112, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8826865553855896, + "num_tokens": 506723499.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5874519348144531, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8768758177757263, + "num_tokens": 506761423.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6304051876068115, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8759267330169678, + "num_tokens": 506798162.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5017192363739014, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8748090863227844, + "num_tokens": 506839040.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6579123735427856, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8708582520484924, + "num_tokens": 506873511.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5671639442443848, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8791459202766418, + "num_tokens": 506913700.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5574809312820435, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8660010695457458, + "num_tokens": 506956769.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6146608591079712, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8857042789459229, + "num_tokens": 506989611.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6233927011489868, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8790864944458008, + "num_tokens": 507029297.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.637881875038147, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8753042817115784, + "num_tokens": 507067565.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.633711338043213, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8674865961074829, + "num_tokens": 507102233.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4794824123382568, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.874907374382019, + "num_tokens": 507143326.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7372995615005493, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8545695543289185, + "num_tokens": 507177723.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5328748226165771, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8694915771484375, + "num_tokens": 507218408.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7613774538040161, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.849565327167511, + "num_tokens": 507256214.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.781554937362671, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8673275113105774, + "num_tokens": 507285837.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6705435514450073, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8730668425559998, + "num_tokens": 507320136.0, + "step": 13293 + }, + { + "epoch": 1.6911334435822414, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7251397371292114, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.855644941329956, + "num_tokens": 507354581.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5910969972610474, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8810112476348877, + "num_tokens": 507389552.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4970812797546387, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8789262175559998, + "num_tokens": 507428878.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.531119465827942, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8625346422195435, + "num_tokens": 507471730.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.556606650352478, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8697137236595154, + "num_tokens": 507516465.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5683926343917847, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8748159408569336, + "num_tokens": 507556475.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6298489570617676, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8784252405166626, + "num_tokens": 507592264.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4657222032546997, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.879047691822052, + "num_tokens": 507636470.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5226105451583862, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8761084079742432, + "num_tokens": 507675415.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6028611660003662, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.869773268699646, + "num_tokens": 507717772.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.710341215133667, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8774099946022034, + "num_tokens": 507751058.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5905486345291138, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.863158643245697, + "num_tokens": 507790581.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6981412172317505, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8743465542793274, + "num_tokens": 507820897.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7646336555480957, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8623651266098022, + "num_tokens": 507851331.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5568257570266724, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.864332377910614, + "num_tokens": 507887855.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.51456880569458, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.876433253288269, + "num_tokens": 507927089.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.622766137123108, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8578878045082092, + "num_tokens": 507966031.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5330091714859009, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8721675276756287, + "num_tokens": 508008980.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4554998874664307, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8725980520248413, + "num_tokens": 508052314.0, + "step": 13312 + }, + { + "epoch": 1.693550438875461, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5841283798217773, + "learning_rate": 1e-06, + "loss": 0.3313, + "mean_token_accuracy": 0.8908872604370117, + "num_tokens": 508088043.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4581743478775024, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.882281482219696, + "num_tokens": 508129380.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5639640092849731, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8827682733535767, + "num_tokens": 508163890.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5243958234786987, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8648138046264648, + "num_tokens": 508205624.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6350994110107422, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.868401288986206, + "num_tokens": 508242360.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.616696834564209, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8754088282585144, + "num_tokens": 508280258.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7825514078140259, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8606635928153992, + "num_tokens": 508313765.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5947134494781494, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8724381327629089, + "num_tokens": 508352965.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5628691911697388, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8663490414619446, + "num_tokens": 508394353.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6029765605926514, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8778889179229736, + "num_tokens": 508429855.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4928791522979736, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8697832822799683, + "num_tokens": 508472886.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7237823009490967, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8442263603210449, + "num_tokens": 508511537.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.569957971572876, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8814362287521362, + "num_tokens": 508549069.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5992459058761597, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8777866959571838, + "num_tokens": 508584506.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 16.841487884521484, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8869982957839966, + "num_tokens": 508618305.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6621623039245605, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.882531464099884, + "num_tokens": 508650881.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7810286283493042, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8862266540527344, + "num_tokens": 508685769.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6202821731567383, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8833767175674438, + "num_tokens": 508718953.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7911738157272339, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8486512303352356, + "num_tokens": 508756941.0, + "step": 13331 + }, + { + "epoch": 1.6959674341686808, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5769058465957642, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8667261004447937, + "num_tokens": 508797569.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5155123472213745, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8775657415390015, + "num_tokens": 508837181.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6577272415161133, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8602890968322754, + "num_tokens": 508873519.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5827265977859497, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8547523021697998, + "num_tokens": 508915738.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8036701679229736, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8630695343017578, + "num_tokens": 508950800.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.568814992904663, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8713270425796509, + "num_tokens": 508988975.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5558031797409058, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8780497312545776, + "num_tokens": 509026758.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5698468685150146, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8796966075897217, + "num_tokens": 509066088.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7329720258712769, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8722168207168579, + "num_tokens": 509108111.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5270299911499023, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8731068968772888, + "num_tokens": 509150115.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6112194061279297, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8753542900085449, + "num_tokens": 509186787.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4542120695114136, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8710525035858154, + "num_tokens": 509231062.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6615947484970093, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8833463191986084, + "num_tokens": 509265071.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6512548923492432, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8666457533836365, + "num_tokens": 509302714.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5720773935317993, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8748951554298401, + "num_tokens": 509341094.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.598233699798584, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8707832098007202, + "num_tokens": 509378845.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8080923557281494, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.869985818862915, + "num_tokens": 509410904.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6259870529174805, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8807443380355835, + "num_tokens": 509450488.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6840558052062988, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8545244336128235, + "num_tokens": 509490326.0, + "step": 13350 + }, + { + "epoch": 1.6983844294619006, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.462612271308899, + "learning_rate": 1e-06, + "loss": 0.3173, + "mean_token_accuracy": 0.8942654132843018, + "num_tokens": 509529772.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.579343318939209, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.88264000415802, + "num_tokens": 509563294.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.544669508934021, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8749358654022217, + "num_tokens": 509598697.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.522767186164856, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8760796785354614, + "num_tokens": 509634800.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.545119285583496, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.869218111038208, + "num_tokens": 509676189.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.746675729751587, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8583865761756897, + "num_tokens": 509710400.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5393157005310059, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.873153567314148, + "num_tokens": 509750328.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6775013208389282, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8595561981201172, + "num_tokens": 509788256.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4809330701828003, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8873573541641235, + "num_tokens": 509830440.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7916513681411743, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8812437057495117, + "num_tokens": 509870504.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6911625862121582, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8675767183303833, + "num_tokens": 509908330.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5069276094436646, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.877831220626831, + "num_tokens": 509950376.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7137223482131958, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8686417937278748, + "num_tokens": 509987526.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5900218486785889, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8712899684906006, + "num_tokens": 510026853.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.779022455215454, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8745571374893188, + "num_tokens": 510061455.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.651684045791626, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8632455468177795, + "num_tokens": 510100834.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7453346252441406, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8652938604354858, + "num_tokens": 510140686.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6603038311004639, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8713051080703735, + "num_tokens": 510176481.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.58594810962677, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8802626729011536, + "num_tokens": 510210644.0, + "step": 13369 + }, + { + "epoch": 1.7008014247551202, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6434402465820312, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8578839302062988, + "num_tokens": 510248314.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5460933446884155, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8745793104171753, + "num_tokens": 510283249.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.766860008239746, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.861234188079834, + "num_tokens": 510319002.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.591892957687378, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8688029050827026, + "num_tokens": 510356582.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5020687580108643, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8593467473983765, + "num_tokens": 510400253.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4645278453826904, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8739668726921082, + "num_tokens": 510445312.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5227779150009155, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8795590996742249, + "num_tokens": 510483215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6161549091339111, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8673160672187805, + "num_tokens": 510524441.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6966451406478882, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8638945817947388, + "num_tokens": 510564029.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6334035396575928, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8638893365859985, + "num_tokens": 510607686.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6288487911224365, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8824846744537354, + "num_tokens": 510639752.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6997787952423096, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.875551700592041, + "num_tokens": 510674486.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5344974994659424, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8788987994194031, + "num_tokens": 510712036.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5249677896499634, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8909342885017395, + "num_tokens": 510752366.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5136399269104004, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8704236745834351, + "num_tokens": 510794032.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6810767650604248, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8750782012939453, + "num_tokens": 510828715.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8429819345474243, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8631744384765625, + "num_tokens": 510862611.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7119406461715698, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8660120368003845, + "num_tokens": 510902668.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6635628938674927, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8731355667114258, + "num_tokens": 510942611.0, + "step": 13388 + }, + { + "epoch": 1.7032184200483398, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5806657075881958, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8882225751876831, + "num_tokens": 510980412.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6048892736434937, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8635907769203186, + "num_tokens": 511016854.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6142328977584839, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8755396008491516, + "num_tokens": 511054378.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5594321489334106, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8685474395751953, + "num_tokens": 511095532.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.550956130027771, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.870474100112915, + "num_tokens": 511133611.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.598671317100525, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8753538131713867, + "num_tokens": 511170024.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7728381156921387, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8605363368988037, + "num_tokens": 511205219.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5098241567611694, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8827125430107117, + "num_tokens": 511248575.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7344906330108643, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8628331422805786, + "num_tokens": 511283715.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6576980352401733, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8756493926048279, + "num_tokens": 511317917.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5509414672851562, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.875775933265686, + "num_tokens": 511358312.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5394458770751953, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8819488286972046, + "num_tokens": 511397066.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8124052286148071, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8595820069313049, + "num_tokens": 511431319.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5993832349777222, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8873889446258545, + "num_tokens": 511465831.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7391347885131836, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8648735284805298, + "num_tokens": 511499832.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7324821949005127, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8814593553543091, + "num_tokens": 511538193.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.523769736289978, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8593018054962158, + "num_tokens": 511578397.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4591790437698364, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8872941732406616, + "num_tokens": 511618515.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5797653198242188, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8646003007888794, + "num_tokens": 511659992.0, + "step": 13407 + }, + { + "epoch": 1.7056354153415596, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6289522647857666, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8581851720809937, + "num_tokens": 511700733.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8818799257278442, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8461717367172241, + "num_tokens": 511733557.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6082143783569336, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8776388168334961, + "num_tokens": 511769511.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6052948236465454, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8633393049240112, + "num_tokens": 511805340.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.634428858757019, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8724478483200073, + "num_tokens": 511839760.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6398049592971802, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8616678714752197, + "num_tokens": 511879400.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7130922079086304, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8784574866294861, + "num_tokens": 511914857.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7171602249145508, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8777498602867126, + "num_tokens": 511948521.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7721673250198364, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8903526067733765, + "num_tokens": 511979390.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8262954950332642, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.867375373840332, + "num_tokens": 512014973.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7010645866394043, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8563300371170044, + "num_tokens": 512051482.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7218753099441528, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.871802568435669, + "num_tokens": 512081519.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5375804901123047, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8677853345870972, + "num_tokens": 512121182.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4902710914611816, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8726578950881958, + "num_tokens": 512161841.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6420177221298218, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8715778589248657, + "num_tokens": 512197618.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5651828050613403, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8768659830093384, + "num_tokens": 512233357.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6235851049423218, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8714032173156738, + "num_tokens": 512269052.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.49454665184021, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8707245588302612, + "num_tokens": 512306789.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.608414888381958, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8796781301498413, + "num_tokens": 512341877.0, + "step": 13426 + }, + { + "epoch": 1.7080524106347794, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6438605785369873, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8665154576301575, + "num_tokens": 512382279.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5905215740203857, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8759803771972656, + "num_tokens": 512421006.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5914455652236938, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8815613985061646, + "num_tokens": 512459894.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7508128881454468, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8683772087097168, + "num_tokens": 512489747.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6199960708618164, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8892017602920532, + "num_tokens": 512521639.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6192679405212402, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.876404881477356, + "num_tokens": 512566068.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7181552648544312, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.869644284248352, + "num_tokens": 512603729.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7107353210449219, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.868442952632904, + "num_tokens": 512643633.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5232675075531006, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8849378228187561, + "num_tokens": 512682255.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.501688838005066, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8614100813865662, + "num_tokens": 512725812.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.620663046836853, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8679033517837524, + "num_tokens": 512762557.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4711865186691284, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8733053803443909, + "num_tokens": 512805794.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5611181259155273, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8850219249725342, + "num_tokens": 512843155.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5961920022964478, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8637927770614624, + "num_tokens": 512882622.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5435973405838013, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8721554279327393, + "num_tokens": 512922098.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5388222932815552, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.863431453704834, + "num_tokens": 512961372.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.548508644104004, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8730587363243103, + "num_tokens": 513003108.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8035415410995483, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8573520183563232, + "num_tokens": 513037916.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7354633808135986, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8906068801879883, + "num_tokens": 513068105.0, + "step": 13445 + }, + { + "epoch": 1.710469405927999, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7494592666625977, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8626523017883301, + "num_tokens": 513103212.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6453399658203125, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8782954216003418, + "num_tokens": 513144682.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5050902366638184, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8944672346115112, + "num_tokens": 513179557.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4949053525924683, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8778189420700073, + "num_tokens": 513221124.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.3658146858215332, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8773380517959595, + "num_tokens": 513269673.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.650554895401001, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8779895305633545, + "num_tokens": 513307847.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6607873439788818, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.859767735004425, + "num_tokens": 513344729.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4848452806472778, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8868662714958191, + "num_tokens": 513385211.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7178654670715332, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8731222152709961, + "num_tokens": 513419472.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4604384899139404, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8705264925956726, + "num_tokens": 513459672.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.566815972328186, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8744076490402222, + "num_tokens": 513495344.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6026673316955566, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8711378574371338, + "num_tokens": 513535034.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6124334335327148, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8657674789428711, + "num_tokens": 513572037.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.624610185623169, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8535360097885132, + "num_tokens": 513615150.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5043764114379883, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8729133009910583, + "num_tokens": 513659656.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6049998998641968, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8660091161727905, + "num_tokens": 513695916.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6085344552993774, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.85650634765625, + "num_tokens": 513735425.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5011630058288574, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8660227060317993, + "num_tokens": 513779118.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.712311029434204, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8790292739868164, + "num_tokens": 513811184.0, + "step": 13464 + }, + { + "epoch": 1.7128864012212186, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6574630737304688, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8793209791183472, + "num_tokens": 513847022.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6894406080245972, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8588806390762329, + "num_tokens": 513884595.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6391643285751343, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.877625584602356, + "num_tokens": 513920209.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6543644666671753, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.876075804233551, + "num_tokens": 513956122.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.590327262878418, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8676634430885315, + "num_tokens": 513996755.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7979464530944824, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8668464422225952, + "num_tokens": 514036615.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6120725870132446, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.873943567276001, + "num_tokens": 514075662.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5828046798706055, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8869758248329163, + "num_tokens": 514112908.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.580771803855896, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8615106344223022, + "num_tokens": 514152700.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.4758167266845703, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8859115839004517, + "num_tokens": 514188065.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7432518005371094, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.872431755065918, + "num_tokens": 514222969.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6555322408676147, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8674657940864563, + "num_tokens": 514260315.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.652721643447876, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.87709641456604, + "num_tokens": 514295013.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5074020624160767, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8803759813308716, + "num_tokens": 514331892.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5848324298858643, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8739233613014221, + "num_tokens": 514367875.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.692342758178711, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8728363513946533, + "num_tokens": 514403878.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5583996772766113, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8515262007713318, + "num_tokens": 514448173.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6452118158340454, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8627737164497375, + "num_tokens": 514488005.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6895549297332764, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8667237758636475, + "num_tokens": 514526910.0, + "step": 13483 + }, + { + "epoch": 1.7153033965144384, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6454867124557495, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8698152303695679, + "num_tokens": 514569026.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.595205307006836, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8673611879348755, + "num_tokens": 514610300.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5719960927963257, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8784877061843872, + "num_tokens": 514645280.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7224117517471313, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8788236379623413, + "num_tokens": 514679882.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5914660692214966, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8716630935668945, + "num_tokens": 514718641.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5846388339996338, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8856555223464966, + "num_tokens": 514755115.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7187676429748535, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8780114650726318, + "num_tokens": 514787966.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7342463731765747, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8712261915206909, + "num_tokens": 514818679.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6952401399612427, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8423728942871094, + "num_tokens": 514857454.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5404480695724487, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.877665638923645, + "num_tokens": 514898312.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5533711910247803, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8657065033912659, + "num_tokens": 514935151.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5753260850906372, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.872162938117981, + "num_tokens": 514975352.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.8202794790267944, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8721181154251099, + "num_tokens": 515007513.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5433847904205322, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8796120882034302, + "num_tokens": 515044440.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5137386322021484, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8709333539009094, + "num_tokens": 515083755.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5326040983200073, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.875192403793335, + "num_tokens": 515122301.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7083275318145752, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8586111068725586, + "num_tokens": 515161373.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.637904405593872, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.871505618095398, + "num_tokens": 515196345.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6354674100875854, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8909565210342407, + "num_tokens": 515231282.0, + "step": 13502 + }, + { + "epoch": 1.717720391807658, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5264214277267456, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8725812435150146, + "num_tokens": 515268733.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5553104877471924, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8717875480651855, + "num_tokens": 515309100.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8370059728622437, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8758997321128845, + "num_tokens": 515342018.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7932459115982056, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8897162675857544, + "num_tokens": 515373825.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.2554447650909424, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8726101517677307, + "num_tokens": 515410986.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6988472938537598, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.870205283164978, + "num_tokens": 515449730.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.769922137260437, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8453205823898315, + "num_tokens": 515487654.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6643364429473877, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8617361187934875, + "num_tokens": 515524550.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8135205507278442, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8765647411346436, + "num_tokens": 515563291.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.9436074495315552, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8731511235237122, + "num_tokens": 515599583.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6542911529541016, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8529622554779053, + "num_tokens": 515637328.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7103550434112549, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.850408673286438, + "num_tokens": 515675549.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4611232280731201, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8559417724609375, + "num_tokens": 515719310.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7746601104736328, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8656336069107056, + "num_tokens": 515752434.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.704223394393921, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8860737085342407, + "num_tokens": 515787710.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.284593105316162, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8590388298034668, + "num_tokens": 515830285.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7595088481903076, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8783381581306458, + "num_tokens": 515862687.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4697555303573608, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.873793363571167, + "num_tokens": 515908468.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5463231801986694, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8747293949127197, + "num_tokens": 515944765.0, + "step": 13521 + }, + { + "epoch": 1.7201373871008778, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5260276794433594, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8734204769134521, + "num_tokens": 515988005.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.732111930847168, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.868726909160614, + "num_tokens": 516026198.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.6433717012405396, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8561493158340454, + "num_tokens": 516062340.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.5572916269302368, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8812294006347656, + "num_tokens": 516096868.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.686904788017273, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8755673170089722, + "num_tokens": 516131238.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6564284563064575, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8682649731636047, + "num_tokens": 516168918.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7157013416290283, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8801612854003906, + "num_tokens": 516200385.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 2.0466198921203613, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.852141797542572, + "num_tokens": 516227887.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.9161101579666138, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8845654129981995, + "num_tokens": 516270148.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6636253595352173, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8664751052856445, + "num_tokens": 516304751.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.556031346321106, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8850218653678894, + "num_tokens": 516340745.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.550018548965454, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8758589625358582, + "num_tokens": 516378022.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7754641771316528, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8696045279502869, + "num_tokens": 516414669.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7846252918243408, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8712267875671387, + "num_tokens": 516446595.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6303595304489136, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8899773359298706, + "num_tokens": 516481160.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6212340593338013, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8551722764968872, + "num_tokens": 516521000.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5196609497070312, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8803275227546692, + "num_tokens": 516558991.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5970946550369263, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8775272369384766, + "num_tokens": 516595352.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5793631076812744, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8729826211929321, + "num_tokens": 516635100.0, + "step": 13540 + }, + { + "epoch": 1.7225543823940974, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7284148931503296, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8843916058540344, + "num_tokens": 516668474.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5916402339935303, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8803659081459045, + "num_tokens": 516708909.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5915254354476929, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8743432760238647, + "num_tokens": 516746760.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4317867755889893, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8866977691650391, + "num_tokens": 516788997.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5326149463653564, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8639469742774963, + "num_tokens": 516830213.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4222021102905273, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8665676116943359, + "num_tokens": 516875539.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.704345941543579, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8789665102958679, + "num_tokens": 516905794.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5987699031829834, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8612583875656128, + "num_tokens": 516945835.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6954787969589233, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8649777173995972, + "num_tokens": 516987746.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5791507959365845, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8534213900566101, + "num_tokens": 517034185.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8008675575256348, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8721904754638672, + "num_tokens": 517071756.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 2.4318695068359375e-05, + "grad_norm": 1.7044068574905396, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8634423613548279, + "num_tokens": 517105872.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5602504014968872, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.861078143119812, + "num_tokens": 517146266.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.573582410812378, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8691879510879517, + "num_tokens": 517185954.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7469334602355957, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8630039691925049, + "num_tokens": 517218455.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5580763816833496, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8590563535690308, + "num_tokens": 517257590.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5107067823410034, + "learning_rate": 1e-06, + "loss": 0.3193, + "mean_token_accuracy": 0.8942724466323853, + "num_tokens": 517297361.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7584271430969238, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.866522490978241, + "num_tokens": 517328234.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.638189435005188, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8620837926864624, + "num_tokens": 517370650.0, + "step": 13559 + }, + { + "epoch": 1.724971377687317, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5916084051132202, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8718332052230835, + "num_tokens": 517406554.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.74504816532135, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8669768571853638, + "num_tokens": 517446586.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.576953411102295, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8790603280067444, + "num_tokens": 517481382.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4760044813156128, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8896914720535278, + "num_tokens": 517521440.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7345995903015137, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8819748163223267, + "num_tokens": 517554717.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5030499696731567, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.877007246017456, + "num_tokens": 517596301.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6239944696426392, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8763532042503357, + "num_tokens": 517633441.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6882901191711426, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8680993318557739, + "num_tokens": 517667523.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.704283356666565, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8566440343856812, + "num_tokens": 517702607.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5685150623321533, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8794085383415222, + "num_tokens": 517741503.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5948714017868042, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8618991374969482, + "num_tokens": 517783762.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.640753984451294, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8775558471679688, + "num_tokens": 517820822.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5184980630874634, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8703106045722961, + "num_tokens": 517861840.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.652765154838562, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8750730752944946, + "num_tokens": 517895292.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5714967250823975, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8786652088165283, + "num_tokens": 517935857.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.566977620124817, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8828229904174805, + "num_tokens": 517972170.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5892109870910645, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8796204328536987, + "num_tokens": 518006577.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5520076751708984, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8757579326629639, + "num_tokens": 518044834.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8372160196304321, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.874413013458252, + "num_tokens": 518076790.0, + "step": 13578 + }, + { + "epoch": 1.7273883729805368, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.519677758216858, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8732345700263977, + "num_tokens": 518117558.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4695439338684082, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8749125003814697, + "num_tokens": 518162247.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6419644355773926, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8698270916938782, + "num_tokens": 518197004.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.622733473777771, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8820071220397949, + "num_tokens": 518233751.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5906767845153809, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8717687129974365, + "num_tokens": 518277836.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7000535726547241, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8740590810775757, + "num_tokens": 518311779.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5198924541473389, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8741499185562134, + "num_tokens": 518350557.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6579971313476562, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8744059801101685, + "num_tokens": 518385014.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5455683469772339, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8576512336730957, + "num_tokens": 518425513.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.640351414680481, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8784741759300232, + "num_tokens": 518460376.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7213523387908936, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.87593013048172, + "num_tokens": 518498179.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6413480043411255, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8698605298995972, + "num_tokens": 518539339.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.642972469329834, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8782538771629333, + "num_tokens": 518572258.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6073192358016968, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8679221868515015, + "num_tokens": 518617221.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5634913444519043, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.879132091999054, + "num_tokens": 518659125.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7100807428359985, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8667405843734741, + "num_tokens": 518695814.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6567879915237427, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.868867039680481, + "num_tokens": 518729263.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5628583431243896, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8768106698989868, + "num_tokens": 518766190.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.538895845413208, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8766905665397644, + "num_tokens": 518806149.0, + "step": 13597 + }, + { + "epoch": 1.7298053682737566, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.571580410003662, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8720939755439758, + "num_tokens": 518846920.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6182841062545776, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8756976127624512, + "num_tokens": 518885431.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5745854377746582, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8721266984939575, + "num_tokens": 518920713.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5308241844177246, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8777207136154175, + "num_tokens": 518959940.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5333459377288818, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8782448768615723, + "num_tokens": 518999381.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6156548261642456, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8785921931266785, + "num_tokens": 519037723.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.512850284576416, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8799960613250732, + "num_tokens": 519076328.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4443693161010742, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8744175434112549, + "num_tokens": 519118200.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.589457631111145, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8655146360397339, + "num_tokens": 519162078.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6117287874221802, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8684290647506714, + "num_tokens": 519199964.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5170429944992065, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8828339576721191, + "num_tokens": 519240419.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7149866819381714, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8717688322067261, + "num_tokens": 519279681.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5348472595214844, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8701218366622925, + "num_tokens": 519321780.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.70201575756073, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8716416954994202, + "num_tokens": 519359983.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.516222596168518, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8735841512680054, + "num_tokens": 519399741.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6825393438339233, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8799272179603577, + "num_tokens": 519433708.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.620047926902771, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8682866096496582, + "num_tokens": 519473827.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6271097660064697, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8803548812866211, + "num_tokens": 519508289.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6492527723312378, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.86647629737854, + "num_tokens": 519548259.0, + "step": 13616 + }, + { + "epoch": 1.7322223635669762, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6059499979019165, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8817837238311768, + "num_tokens": 519584413.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6808786392211914, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8676570653915405, + "num_tokens": 519628161.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6490226984024048, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8670127391815186, + "num_tokens": 519667148.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6745147705078125, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8650806546211243, + "num_tokens": 519705308.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4668500423431396, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8686329126358032, + "num_tokens": 519748244.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.778906226158142, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8635114431381226, + "num_tokens": 519783072.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.609565019607544, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8713886737823486, + "num_tokens": 519825050.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7057656049728394, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8630163669586182, + "num_tokens": 519860687.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6379272937774658, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8817192316055298, + "num_tokens": 519894019.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5694903135299683, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.880287230014801, + "num_tokens": 519931040.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.544277310371399, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.866644024848938, + "num_tokens": 519971311.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6678922176361084, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.874060332775116, + "num_tokens": 520007185.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6429884433746338, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8558804392814636, + "num_tokens": 520047889.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5777887105941772, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8804153203964233, + "num_tokens": 520085165.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6545062065124512, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8660922646522522, + "num_tokens": 520125280.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6552404165267944, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8488762378692627, + "num_tokens": 520161658.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.733078122138977, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8591992855072021, + "num_tokens": 520197130.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6148173809051514, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8640973567962646, + "num_tokens": 520236190.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5879100561141968, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8777175545692444, + "num_tokens": 520273580.0, + "step": 13635 + }, + { + "epoch": 1.7346393588601958, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5368375778198242, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8713710308074951, + "num_tokens": 520316424.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7385177612304688, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8703134655952454, + "num_tokens": 520352870.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5318773984909058, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8900505900382996, + "num_tokens": 520387724.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5417475700378418, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8705412149429321, + "num_tokens": 520432885.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7063201665878296, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8577351570129395, + "num_tokens": 520468752.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8343788385391235, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8683420419692993, + "num_tokens": 520504687.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.718703269958496, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8664823770523071, + "num_tokens": 520538257.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6155478954315186, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.870539665222168, + "num_tokens": 520576416.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6668378114700317, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8819122910499573, + "num_tokens": 520612554.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7176604270935059, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8663104772567749, + "num_tokens": 520647797.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6367214918136597, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8756246566772461, + "num_tokens": 520686193.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.646758794784546, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8587406277656555, + "num_tokens": 520723997.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5919184684753418, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8894540071487427, + "num_tokens": 520761294.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5545933246612549, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.878182053565979, + "num_tokens": 520800488.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5767344236373901, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8812936544418335, + "num_tokens": 520838349.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.601763129234314, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8786271810531616, + "num_tokens": 520875181.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6398826837539673, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8584244847297668, + "num_tokens": 520914715.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6612775325775146, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.866402268409729, + "num_tokens": 520954742.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6193774938583374, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8787950277328491, + "num_tokens": 520995327.0, + "step": 13654 + }, + { + "epoch": 1.7370563541534156, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6083967685699463, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8795875906944275, + "num_tokens": 521031919.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7803279161453247, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8656617999076843, + "num_tokens": 521064465.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6139775514602661, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.873415470123291, + "num_tokens": 521101099.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6517583131790161, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8643837571144104, + "num_tokens": 521139987.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6277492046356201, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8644800186157227, + "num_tokens": 521181692.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4788891077041626, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8773061037063599, + "num_tokens": 521225639.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.506557822227478, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8755663633346558, + "num_tokens": 521264733.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6075687408447266, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8781923055648804, + "num_tokens": 521299234.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6086742877960205, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.874115526676178, + "num_tokens": 521337987.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6321970224380493, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8781008720397949, + "num_tokens": 521377255.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5805563926696777, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8608214259147644, + "num_tokens": 521417767.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6268863677978516, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.872125506401062, + "num_tokens": 521453424.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.607724666595459, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8714922666549683, + "num_tokens": 521492091.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5475099086761475, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8809677362442017, + "num_tokens": 521531148.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4953762292861938, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8686034679412842, + "num_tokens": 521578037.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4382872581481934, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8692060708999634, + "num_tokens": 521623605.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5807148218154907, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8600683212280273, + "num_tokens": 521671023.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.587292194366455, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8677915930747986, + "num_tokens": 521711465.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4957481622695923, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8832254409790039, + "num_tokens": 521751675.0, + "step": 13673 + }, + { + "epoch": 1.7394733494466355, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5339845418930054, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8552032709121704, + "num_tokens": 521798442.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.73091459274292, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8773468732833862, + "num_tokens": 521829073.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5339659452438354, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8655862808227539, + "num_tokens": 521869254.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.665210485458374, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8725299835205078, + "num_tokens": 521902220.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.555657148361206, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8700611591339111, + "num_tokens": 521940269.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6727694272994995, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.861841082572937, + "num_tokens": 521979874.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5512768030166626, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8786327838897705, + "num_tokens": 522019309.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6748862266540527, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8633513450622559, + "num_tokens": 522058483.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4223257303237915, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8788208961486816, + "num_tokens": 522102603.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5977399349212646, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8743090033531189, + "num_tokens": 522139057.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6180058717727661, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8599134683609009, + "num_tokens": 522181365.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6585297584533691, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8626607656478882, + "num_tokens": 522216599.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.658017873764038, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8623636960983276, + "num_tokens": 522256229.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6050375699996948, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8682665824890137, + "num_tokens": 522294459.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6335526704788208, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8804170489311218, + "num_tokens": 522328831.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5340439081192017, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.861984372138977, + "num_tokens": 522372597.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6029741764068604, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8696492910385132, + "num_tokens": 522408968.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5817899703979492, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8734797239303589, + "num_tokens": 522449483.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5291197299957275, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8805611729621887, + "num_tokens": 522490587.0, + "step": 13692 + }, + { + "epoch": 1.741890344739855, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4885685443878174, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8683623671531677, + "num_tokens": 522533582.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5950120687484741, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8695409893989563, + "num_tokens": 522570471.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5595788955688477, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8860284090042114, + "num_tokens": 522606486.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.637442708015442, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8541268110275269, + "num_tokens": 522649114.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6818835735321045, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8778533935546875, + "num_tokens": 522682027.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5397014617919922, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8783180713653564, + "num_tokens": 522722201.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6087013483047485, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8728445768356323, + "num_tokens": 522761652.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6779298782348633, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8670939207077026, + "num_tokens": 522799688.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6537023782730103, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8746751546859741, + "num_tokens": 522832880.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.58931565284729, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8607664108276367, + "num_tokens": 522872078.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.524621844291687, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8794084191322327, + "num_tokens": 522911716.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6625663042068481, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.872275710105896, + "num_tokens": 522951672.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6411974430084229, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8713253736495972, + "num_tokens": 522986790.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7228623628616333, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8695082664489746, + "num_tokens": 523018407.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.729323148727417, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.865776538848877, + "num_tokens": 523054906.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6397343873977661, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8586314916610718, + "num_tokens": 523095880.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.9298217296600342, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8650744557380676, + "num_tokens": 523135344.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5593584775924683, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8709666728973389, + "num_tokens": 523175847.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5105876922607422, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8683772087097168, + "num_tokens": 523218877.0, + "step": 13711 + }, + { + "epoch": 1.7443073400330746, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.64719557762146, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8524863719940186, + "num_tokens": 523260592.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6682642698287964, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8894351720809937, + "num_tokens": 523295930.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5054957866668701, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8790324330329895, + "num_tokens": 523341552.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6211013793945312, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8634034395217896, + "num_tokens": 523382323.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5703729391098022, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8796218037605286, + "num_tokens": 523423140.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6658556461334229, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8657755851745605, + "num_tokens": 523459763.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7914574146270752, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8637257218360901, + "num_tokens": 523492629.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6162598133087158, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8620859980583191, + "num_tokens": 523536699.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7741286754608154, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8585805892944336, + "num_tokens": 523574417.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5663756132125854, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8746480941772461, + "num_tokens": 523613365.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6388927698135376, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8700424432754517, + "num_tokens": 523650758.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6095094680786133, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8840718269348145, + "num_tokens": 523685546.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.541501760482788, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8786879777908325, + "num_tokens": 523723566.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6699297428131104, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8738963603973389, + "num_tokens": 523758133.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5339851379394531, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8763388991355896, + "num_tokens": 523796165.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5213526487350464, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.882835865020752, + "num_tokens": 523836765.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6059480905532837, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8679478168487549, + "num_tokens": 523875067.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6624032258987427, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8570023775100708, + "num_tokens": 523913995.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.420675277709961, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8844311237335205, + "num_tokens": 523956969.0, + "step": 13730 + }, + { + "epoch": 1.7467243353262942, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5802499055862427, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8686525821685791, + "num_tokens": 523994450.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5873438119888306, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8686063289642334, + "num_tokens": 524036946.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6490978002548218, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8742534518241882, + "num_tokens": 524071202.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5031111240386963, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8673150539398193, + "num_tokens": 524113282.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5055551528930664, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8837924003601074, + "num_tokens": 524150661.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7684568166732788, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8839758634567261, + "num_tokens": 524181680.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4528011083602905, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8697479963302612, + "num_tokens": 524225930.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6196032762527466, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8809454441070557, + "num_tokens": 524262463.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6801669597625732, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8701226711273193, + "num_tokens": 524300399.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6540944576263428, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8662018775939941, + "num_tokens": 524337430.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5364304780960083, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.869850218296051, + "num_tokens": 524378341.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6644744873046875, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8718384504318237, + "num_tokens": 524415615.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5221915245056152, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8751695156097412, + "num_tokens": 524458335.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5301581621170044, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8779548406600952, + "num_tokens": 524499564.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.429243803024292, + "learning_rate": 1e-06, + "loss": 0.308, + "mean_token_accuracy": 0.8984882235527039, + "num_tokens": 524539259.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4040004014968872, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8887063264846802, + "num_tokens": 524582325.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4497458934783936, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8774356842041016, + "num_tokens": 524625875.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4393517971038818, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8851035833358765, + "num_tokens": 524669216.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5683269500732422, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.865302562713623, + "num_tokens": 524709991.0, + "step": 13749 + }, + { + "epoch": 1.749141330619514, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7462525367736816, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8592020869255066, + "num_tokens": 524747418.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5062789916992188, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8753359317779541, + "num_tokens": 524787298.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6192089319229126, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8655900955200195, + "num_tokens": 524826222.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4604501724243164, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8827313780784607, + "num_tokens": 524868067.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5411651134490967, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.853848934173584, + "num_tokens": 524910047.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.55992591381073, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8657898902893066, + "num_tokens": 524945762.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5485635995864868, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.866310179233551, + "num_tokens": 524985603.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.493667483329773, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.882759690284729, + "num_tokens": 525026261.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5933115482330322, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.852931797504425, + "num_tokens": 525069340.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6147981882095337, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8786417245864868, + "num_tokens": 525105527.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6743329763412476, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8789093494415283, + "num_tokens": 525138070.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7218735218048096, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8691434264183044, + "num_tokens": 525174030.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6667983531951904, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8644709587097168, + "num_tokens": 525209758.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8457300662994385, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8704116344451904, + "num_tokens": 525237713.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5406372547149658, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8610347509384155, + "num_tokens": 525283869.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5981590747833252, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.870093822479248, + "num_tokens": 525327102.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8375394344329834, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8781603574752808, + "num_tokens": 525360885.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.709197759628296, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8726896047592163, + "num_tokens": 525396985.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.8101178407669067, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8705935478210449, + "num_tokens": 525432690.0, + "step": 13768 + }, + { + "epoch": 1.7515583259127339, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.580536961555481, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8719096779823303, + "num_tokens": 525471733.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.589714527130127, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8877404928207397, + "num_tokens": 525507739.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.571876883506775, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.883078932762146, + "num_tokens": 525546735.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5036605596542358, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8715686798095703, + "num_tokens": 525590740.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6414382457733154, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8769211173057556, + "num_tokens": 525624768.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5582964420318604, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8583313226699829, + "num_tokens": 525663368.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6788733005523682, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8740718364715576, + "num_tokens": 525700983.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6040501594543457, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8945300579071045, + "num_tokens": 525738578.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6404582262039185, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8633151650428772, + "num_tokens": 525780739.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6393041610717773, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8573260307312012, + "num_tokens": 525823219.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5798572301864624, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8757575750350952, + "num_tokens": 525861090.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5592659711837769, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8790067434310913, + "num_tokens": 525898531.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.744534969329834, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8691203594207764, + "num_tokens": 525929345.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5420795679092407, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8720974922180176, + "num_tokens": 525967916.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.625368356704712, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8795945644378662, + "num_tokens": 526004350.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7337990999221802, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8487986922264099, + "num_tokens": 526044768.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6030198335647583, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8668134212493896, + "num_tokens": 526086233.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.546861171722412, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8733015060424805, + "num_tokens": 526123724.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5503090620040894, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8711639642715454, + "num_tokens": 526162062.0, + "step": 13787 + }, + { + "epoch": 1.7539753212059535, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5109589099884033, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8676371574401855, + "num_tokens": 526201521.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.600009799003601, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8686773777008057, + "num_tokens": 526238371.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5611655712127686, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8731799125671387, + "num_tokens": 526277904.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5551784038543701, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8732497692108154, + "num_tokens": 526316590.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.562174677848816, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8693622350692749, + "num_tokens": 526356669.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.56291663646698, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8808620572090149, + "num_tokens": 526393639.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6535675525665283, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8809661269187927, + "num_tokens": 526427790.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5719174146652222, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8691403865814209, + "num_tokens": 526467852.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.561834454536438, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8665178418159485, + "num_tokens": 526509107.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6039903163909912, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8620173931121826, + "num_tokens": 526551253.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6962618827819824, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8651821613311768, + "num_tokens": 526589571.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6216074228286743, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8797227144241333, + "num_tokens": 526627670.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5727070569992065, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.87022864818573, + "num_tokens": 526666461.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.460571050643921, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8781721591949463, + "num_tokens": 526704544.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4914201498031616, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8747597932815552, + "num_tokens": 526747128.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6615275144577026, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8811848163604736, + "num_tokens": 526782058.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.653214693069458, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.855254590511322, + "num_tokens": 526824301.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.583372950553894, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8622699975967407, + "num_tokens": 526862568.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6004124879837036, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.858523428440094, + "num_tokens": 526903305.0, + "step": 13806 + }, + { + "epoch": 1.756392316499173, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4935671091079712, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8676331043243408, + "num_tokens": 526945604.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6281486749649048, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8545069098472595, + "num_tokens": 526983979.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.604070782661438, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8706332445144653, + "num_tokens": 527024707.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6303919553756714, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8694528937339783, + "num_tokens": 527061206.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6566028594970703, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8736217021942139, + "num_tokens": 527100929.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.593186378479004, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8684035539627075, + "num_tokens": 527137806.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6926264762878418, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.854123592376709, + "num_tokens": 527174255.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.513016700744629, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8708949089050293, + "num_tokens": 527214919.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7860891819000244, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8639891147613525, + "num_tokens": 527246155.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.608915090560913, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8713580369949341, + "num_tokens": 527283244.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.627715826034546, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8769474029541016, + "num_tokens": 527321066.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6457949876785278, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8651168346405029, + "num_tokens": 527356504.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5100488662719727, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8725906014442444, + "num_tokens": 527395390.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6650201082229614, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8836067914962769, + "num_tokens": 527427390.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.525821566581726, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8707871437072754, + "num_tokens": 527468731.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5555944442749023, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8879135251045227, + "num_tokens": 527505487.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4898875951766968, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.885486900806427, + "num_tokens": 527546748.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.480863094329834, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8827037811279297, + "num_tokens": 527589443.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5689893960952759, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8760409355163574, + "num_tokens": 527627841.0, + "step": 13825 + }, + { + "epoch": 1.7588093117923929, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6398543119430542, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8828926086425781, + "num_tokens": 527660423.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 3.779406785964966, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8912177085876465, + "num_tokens": 527695164.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5700366497039795, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.867914080619812, + "num_tokens": 527736875.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6154991388320923, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8679106831550598, + "num_tokens": 527773174.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5757163763046265, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8655301332473755, + "num_tokens": 527814824.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7067224979400635, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8683193325996399, + "num_tokens": 527852706.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6697081327438354, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.857255756855011, + "num_tokens": 527889567.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4586634635925293, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8728960156440735, + "num_tokens": 527933152.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5613014698028564, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8655737042427063, + "num_tokens": 527972547.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6342123746871948, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8639112710952759, + "num_tokens": 528008783.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.481964349746704, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8861517906188965, + "num_tokens": 528046684.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5262882709503174, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8826321363449097, + "num_tokens": 528095573.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6149966716766357, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8654183149337769, + "num_tokens": 528135732.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5763404369354248, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8674759864807129, + "num_tokens": 528175260.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5925275087356567, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8572670817375183, + "num_tokens": 528218178.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.658966302871704, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8603236675262451, + "num_tokens": 528256241.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4505077600479126, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8712546229362488, + "num_tokens": 528303384.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5243548154830933, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8778325915336609, + "num_tokens": 528345225.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5384503602981567, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8712531328201294, + "num_tokens": 528388101.0, + "step": 13844 + }, + { + "epoch": 1.7612263070856127, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5910848379135132, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8639450669288635, + "num_tokens": 528425977.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7038367986679077, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.873102068901062, + "num_tokens": 528459264.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.742950439453125, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8605284094810486, + "num_tokens": 528492347.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.741123080253601, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8651851415634155, + "num_tokens": 528528006.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5549293756484985, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8697073459625244, + "num_tokens": 528566396.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5151852369308472, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8872003555297852, + "num_tokens": 528607876.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.73172128200531, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.874129056930542, + "num_tokens": 528639711.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5750330686569214, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8704464435577393, + "num_tokens": 528676202.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 3.73539662361145, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.8894590735435486, + "num_tokens": 528705532.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6663246154785156, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8588655591011047, + "num_tokens": 528742586.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6186772584915161, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8795300722122192, + "num_tokens": 528784653.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6771368980407715, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.853252649307251, + "num_tokens": 528822160.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.486838698387146, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8755642175674438, + "num_tokens": 528863587.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.484612226486206, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8639957308769226, + "num_tokens": 528903780.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7805445194244385, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8675745725631714, + "num_tokens": 528935625.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6686826944351196, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8649889826774597, + "num_tokens": 528970286.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4725112915039062, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.881717324256897, + "num_tokens": 529012579.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.567307710647583, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8791443705558777, + "num_tokens": 529047482.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5802946090698242, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8856440782546997, + "num_tokens": 529082583.0, + "step": 13863 + }, + { + "epoch": 1.7636433023788323, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8011842966079712, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8546201586723328, + "num_tokens": 529119335.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7079602479934692, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8605217933654785, + "num_tokens": 529158760.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5072767734527588, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8836722373962402, + "num_tokens": 529198302.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6894526481628418, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8554735779762268, + "num_tokens": 529234724.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5500434637069702, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8808192014694214, + "num_tokens": 529273332.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5041475296020508, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8593616485595703, + "num_tokens": 529318540.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6208932399749756, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8818984031677246, + "num_tokens": 529354455.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.520542025566101, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8898352384567261, + "num_tokens": 529391779.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5065925121307373, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.867103636264801, + "num_tokens": 529433152.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5536104440689087, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8798362016677856, + "num_tokens": 529471013.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5334527492523193, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8730136156082153, + "num_tokens": 529512314.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7145464420318604, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8595035076141357, + "num_tokens": 529552928.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.682508945465088, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8612741827964783, + "num_tokens": 529591348.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5470576286315918, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8775318264961243, + "num_tokens": 529628241.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5131146907806396, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8728078603744507, + "num_tokens": 529672245.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4904378652572632, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.875253438949585, + "num_tokens": 529714483.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6188730001449585, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8767569065093994, + "num_tokens": 529749652.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6896001100540161, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8715996742248535, + "num_tokens": 529786670.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.570650577545166, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8709750771522522, + "num_tokens": 529826372.0, + "step": 13882 + }, + { + "epoch": 1.7660602976720519, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5045926570892334, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8738141059875488, + "num_tokens": 529869143.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.587918758392334, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8707951307296753, + "num_tokens": 529910087.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.547559380531311, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8672193288803101, + "num_tokens": 529947763.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5482875108718872, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8719337582588196, + "num_tokens": 529984567.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7372775077819824, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.865487277507782, + "num_tokens": 530021233.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6859322786331177, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8572636842727661, + "num_tokens": 530060876.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4658790826797485, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8692886829376221, + "num_tokens": 530105695.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5403975248336792, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8777971267700195, + "num_tokens": 530146123.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.481521487236023, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8749845027923584, + "num_tokens": 530186452.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.593127965927124, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8676052093505859, + "num_tokens": 530224594.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.521845817565918, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8661098480224609, + "num_tokens": 530263862.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5809075832366943, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8579930663108826, + "num_tokens": 530304431.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5937492847442627, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8744818568229675, + "num_tokens": 530340686.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.561594843864441, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8804304599761963, + "num_tokens": 530376326.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4399099349975586, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8777607679367065, + "num_tokens": 530420570.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 3.7004878520965576, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8693233728408813, + "num_tokens": 530454440.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5432487726211548, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8710464239120483, + "num_tokens": 530494411.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.737943410873413, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8490476608276367, + "num_tokens": 530528451.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5329654216766357, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8719936609268188, + "num_tokens": 530570232.0, + "step": 13901 + }, + { + "epoch": 1.7684772929652715, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5498076677322388, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8684877157211304, + "num_tokens": 530607906.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6084394454956055, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8631837368011475, + "num_tokens": 530647373.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6444889307022095, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8653731346130371, + "num_tokens": 530683319.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6306555271148682, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8491464853286743, + "num_tokens": 530724528.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.712080478668213, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8698415160179138, + "num_tokens": 530760525.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6384820938110352, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8740209341049194, + "num_tokens": 530794698.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.944858431816101, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8709728717803955, + "num_tokens": 530827098.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.7179721593856812, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8579549789428711, + "num_tokens": 530860636.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.4395861625671387, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8771631121635437, + "num_tokens": 530903173.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6180667877197266, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8678192496299744, + "num_tokens": 530940450.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.486608624458313, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8728529214859009, + "num_tokens": 530981832.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5209921598434448, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8678951859474182, + "num_tokens": 531025372.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6417860984802246, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8716817498207092, + "num_tokens": 531062170.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5644770860671997, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.879988431930542, + "num_tokens": 531102504.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.641476035118103, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8646855354309082, + "num_tokens": 531138265.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6562658548355103, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8685388565063477, + "num_tokens": 531173796.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5975216627120972, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8719244003295898, + "num_tokens": 531213781.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6697120666503906, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8810762166976929, + "num_tokens": 531245637.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7105693817138672, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8557044267654419, + "num_tokens": 531281699.0, + "step": 13920 + }, + { + "epoch": 1.7708942882584913, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4880354404449463, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8657997846603394, + "num_tokens": 531325451.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6902996301651, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8595119714736938, + "num_tokens": 531362478.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6881998777389526, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8766229152679443, + "num_tokens": 531397854.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.706133246421814, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.873995304107666, + "num_tokens": 531432281.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.591905951499939, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8650557994842529, + "num_tokens": 531470842.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7346532344818115, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8505723476409912, + "num_tokens": 531505265.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5809507369995117, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8676072359085083, + "num_tokens": 531544046.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5491687059402466, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8767460584640503, + "num_tokens": 531585353.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7378078699111938, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8759225010871887, + "num_tokens": 531620184.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5590111017227173, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8642771244049072, + "num_tokens": 531660926.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5519335269927979, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8559533357620239, + "num_tokens": 531706895.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6329073905944824, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8870179653167725, + "num_tokens": 531738603.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6580795049667358, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8768852353096008, + "num_tokens": 531776103.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5331110954284668, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8723751902580261, + "num_tokens": 531818996.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5992622375488281, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8776146769523621, + "num_tokens": 531855318.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6042087078094482, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8613795042037964, + "num_tokens": 531897879.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6513923406600952, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.875099778175354, + "num_tokens": 531934815.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6402267217636108, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8729650974273682, + "num_tokens": 531969480.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5775848627090454, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8752457499504089, + "num_tokens": 532009406.0, + "step": 13939 + }, + { + "epoch": 1.773311283551711, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6416844129562378, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8886710405349731, + "num_tokens": 532044793.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8106368780136108, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8452461957931519, + "num_tokens": 532083456.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.658067226409912, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.86077481508255, + "num_tokens": 532124773.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.713022232055664, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8815137147903442, + "num_tokens": 532156482.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6925365924835205, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8785338997840881, + "num_tokens": 532191413.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6483274698257446, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8787210583686829, + "num_tokens": 532227242.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.556944727897644, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8749314546585083, + "num_tokens": 532262822.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5089809894561768, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8881603479385376, + "num_tokens": 532302993.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6417946815490723, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8730741739273071, + "num_tokens": 532338137.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7856707572937012, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8648667335510254, + "num_tokens": 532374103.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.66116464138031, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8697853684425354, + "num_tokens": 532411305.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6868253946304321, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8741694092750549, + "num_tokens": 532443909.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7027292251586914, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8778190016746521, + "num_tokens": 532475327.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5160105228424072, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8763517141342163, + "num_tokens": 532516514.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5385764837265015, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8731337785720825, + "num_tokens": 532558875.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5600965023040771, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8572160005569458, + "num_tokens": 532601619.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6323400735855103, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8717226982116699, + "num_tokens": 532634821.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6251919269561768, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8671008348464966, + "num_tokens": 532670927.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5613422393798828, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8589428663253784, + "num_tokens": 532714159.0, + "step": 13958 + }, + { + "epoch": 1.7757282788449307, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6077862977981567, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8694047927856445, + "num_tokens": 532749708.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6298880577087402, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8817782998085022, + "num_tokens": 532790865.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5631505250930786, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8706340789794922, + "num_tokens": 532827709.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5592460632324219, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8613628149032593, + "num_tokens": 532867915.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.5002549886703491, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8669159412384033, + "num_tokens": 532910634.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5717790126800537, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8678470849990845, + "num_tokens": 532949666.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4801257848739624, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8737032413482666, + "num_tokens": 532992020.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6247313022613525, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8634200096130371, + "num_tokens": 533028029.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.615594744682312, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.867550253868103, + "num_tokens": 533070375.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.515601634979248, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8777403831481934, + "num_tokens": 533113160.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.718939185142517, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8439121246337891, + "num_tokens": 533152715.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6647778749465942, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8683331608772278, + "num_tokens": 533188048.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.6566826105117798, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8595902919769287, + "num_tokens": 533226803.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 2.4437904357910156e-05, + "grad_norm": 1.707029938697815, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8668988347053528, + "num_tokens": 533263469.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6920770406723022, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8611158728599548, + "num_tokens": 533301432.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.727738857269287, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.877445638179779, + "num_tokens": 533335130.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4564507007598877, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8851404786109924, + "num_tokens": 533374815.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6073561906814575, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.879463791847229, + "num_tokens": 533410518.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.721581220626831, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8712956309318542, + "num_tokens": 533445719.0, + "step": 13977 + }, + { + "epoch": 1.7781452741381503, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.649112343788147, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8662488460540771, + "num_tokens": 533485705.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5783668756484985, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8835437893867493, + "num_tokens": 533521228.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6282838582992554, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8689661026000977, + "num_tokens": 533558233.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7276333570480347, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8655052781105042, + "num_tokens": 533595966.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6263985633850098, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8695865869522095, + "num_tokens": 533629227.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.574950933456421, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8780651092529297, + "num_tokens": 533664902.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5297797918319702, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8810145854949951, + "num_tokens": 533702873.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6428824663162231, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8599794507026672, + "num_tokens": 533737295.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.704217791557312, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8600319623947144, + "num_tokens": 533774180.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.645566701889038, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8607660531997681, + "num_tokens": 533811179.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5905994176864624, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.868827223777771, + "num_tokens": 533848892.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6573407649993896, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8864694833755493, + "num_tokens": 533883907.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6771599054336548, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8707465529441833, + "num_tokens": 533921550.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5879993438720703, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8680224418640137, + "num_tokens": 533961319.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.632928490638733, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.875185489654541, + "num_tokens": 533997953.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6544756889343262, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.870647668838501, + "num_tokens": 534035019.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7321465015411377, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8610717058181763, + "num_tokens": 534073011.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.873001217842102, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8483964204788208, + "num_tokens": 534105849.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.55740225315094, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8608057498931885, + "num_tokens": 534149647.0, + "step": 13996 + }, + { + "epoch": 1.78056226943137, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7727155685424805, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8695899248123169, + "num_tokens": 534183901.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5443520545959473, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8730781674385071, + "num_tokens": 534221084.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6279364824295044, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.861896276473999, + "num_tokens": 534259071.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5100326538085938, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8687018156051636, + "num_tokens": 534300964.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5399296283721924, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.881158709526062, + "num_tokens": 534341312.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.559268593788147, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8846509456634521, + "num_tokens": 534380746.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5330748558044434, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8596004843711853, + "num_tokens": 534422193.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6656858921051025, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8806105852127075, + "num_tokens": 534459899.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.589808464050293, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8796379566192627, + "num_tokens": 534498118.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6361448764801025, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8680459260940552, + "num_tokens": 534538547.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5590180158615112, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8729944825172424, + "num_tokens": 534577929.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8486539125442505, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8478401899337769, + "num_tokens": 534616166.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6298813819885254, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8716592192649841, + "num_tokens": 534653404.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5667171478271484, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8591882586479187, + "num_tokens": 534696907.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6060338020324707, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8655052185058594, + "num_tokens": 534733742.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6720271110534668, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8602917194366455, + "num_tokens": 534771065.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.511229157447815, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8855146765708923, + "num_tokens": 534807401.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.590158462524414, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8689408898353577, + "num_tokens": 534849885.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5689789056777954, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8828462362289429, + "num_tokens": 534889413.0, + "step": 14015 + }, + { + "epoch": 1.78297926472459, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.557624101638794, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8642569780349731, + "num_tokens": 534929767.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5736030340194702, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.86726975440979, + "num_tokens": 534970839.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6907318830490112, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8833634257316589, + "num_tokens": 535009390.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.751779317855835, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8743981122970581, + "num_tokens": 535041462.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5913845300674438, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.866460919380188, + "num_tokens": 535080436.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6552352905273438, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8709964752197266, + "num_tokens": 535117484.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.629944086074829, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8788323998451233, + "num_tokens": 535155218.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6368123292922974, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8723713755607605, + "num_tokens": 535188893.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.622089147567749, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8604166507720947, + "num_tokens": 535226794.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5785001516342163, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8705677390098572, + "num_tokens": 535264300.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7326277494430542, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8679404854774475, + "num_tokens": 535297763.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6232668161392212, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8643181324005127, + "num_tokens": 535336459.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.57675039768219, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8571420907974243, + "num_tokens": 535376731.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5218474864959717, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8848591446876526, + "num_tokens": 535417630.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5504052639007568, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8861746191978455, + "num_tokens": 535456862.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5928775072097778, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8678809404373169, + "num_tokens": 535496051.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5794188976287842, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8634521961212158, + "num_tokens": 535537359.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6669865846633911, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8423622846603394, + "num_tokens": 535578304.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7318586111068726, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8939734697341919, + "num_tokens": 535610090.0, + "step": 14034 + }, + { + "epoch": 1.7853962600178095, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.555835485458374, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8559858202934265, + "num_tokens": 535654945.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.551543116569519, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8749057054519653, + "num_tokens": 535697752.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5400161743164062, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8645212650299072, + "num_tokens": 535740903.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.836504578590393, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8704853653907776, + "num_tokens": 535773415.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6429550647735596, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8774441480636597, + "num_tokens": 535807759.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6432671546936035, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8691856265068054, + "num_tokens": 535845832.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.5839626789093018, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8801907300949097, + "num_tokens": 535886414.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8028838634490967, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8535928726196289, + "num_tokens": 535924920.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.668870210647583, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8766099214553833, + "num_tokens": 535958095.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7688452005386353, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8645498752593994, + "num_tokens": 535993640.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5334497690200806, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8587254285812378, + "num_tokens": 536037728.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.738803505897522, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8643018007278442, + "num_tokens": 536069768.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7038770914077759, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8778992891311646, + "num_tokens": 536102599.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4662377834320068, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8703509569168091, + "num_tokens": 536142599.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5401276350021362, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8838285803794861, + "num_tokens": 536181054.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6866058111190796, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8694185614585876, + "num_tokens": 536218146.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6488926410675049, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8672609329223633, + "num_tokens": 536254911.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5684154033660889, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.881614089012146, + "num_tokens": 536290663.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8158406019210815, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8683672547340393, + "num_tokens": 536320198.0, + "step": 14053 + }, + { + "epoch": 1.787813255311029, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7545033693313599, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8758035898208618, + "num_tokens": 536356770.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5919886827468872, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8651238679885864, + "num_tokens": 536394868.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5465070009231567, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8691356182098389, + "num_tokens": 536433609.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5705695152282715, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8759467601776123, + "num_tokens": 536470941.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6110056638717651, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8623452186584473, + "num_tokens": 536512156.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4678056240081787, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8779193162918091, + "num_tokens": 536550374.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7034305334091187, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8647107481956482, + "num_tokens": 536592894.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4919947385787964, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.877020537853241, + "num_tokens": 536632588.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5676734447479248, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8669278621673584, + "num_tokens": 536670624.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5002410411834717, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8822834491729736, + "num_tokens": 536711865.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5795493125915527, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8621599078178406, + "num_tokens": 536751666.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5204010009765625, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8835349678993225, + "num_tokens": 536790852.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5245966911315918, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8751932382583618, + "num_tokens": 536830594.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5712270736694336, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8821569085121155, + "num_tokens": 536866286.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5695629119873047, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8656828999519348, + "num_tokens": 536908035.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5977016687393188, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8845632076263428, + "num_tokens": 536942897.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7738815546035767, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8583618402481079, + "num_tokens": 536978358.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7031879425048828, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8830119371414185, + "num_tokens": 537009098.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4846265316009521, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8734815120697021, + "num_tokens": 537057050.0, + "step": 14072 + }, + { + "epoch": 1.7902302506042487, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4939563274383545, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8842312097549438, + "num_tokens": 537094422.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6512962579727173, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8747612237930298, + "num_tokens": 537129452.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5997884273529053, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8775138854980469, + "num_tokens": 537166728.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8061083555221558, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8655573129653931, + "num_tokens": 537197293.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4576566219329834, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8788592219352722, + "num_tokens": 537239915.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.577977180480957, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8796757459640503, + "num_tokens": 537276114.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4359736442565918, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8742353916168213, + "num_tokens": 537319447.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5321087837219238, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8610314726829529, + "num_tokens": 537357552.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6329549551010132, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.866464376449585, + "num_tokens": 537394650.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5220999717712402, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8706676959991455, + "num_tokens": 537435504.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.511631965637207, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8752118349075317, + "num_tokens": 537475737.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4337199926376343, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8783842921257019, + "num_tokens": 537518590.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6217596530914307, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.865261435508728, + "num_tokens": 537556613.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.642246127128601, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.880436360836029, + "num_tokens": 537591325.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6614187955856323, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8577827215194702, + "num_tokens": 537629642.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.632526159286499, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8716473579406738, + "num_tokens": 537666756.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6072404384613037, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8776438236236572, + "num_tokens": 537703463.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.444270372390747, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8886145353317261, + "num_tokens": 537744818.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.655652403831482, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8808832168579102, + "num_tokens": 537781887.0, + "step": 14091 + }, + { + "epoch": 1.7926472458974685, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6708649396896362, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.877144992351532, + "num_tokens": 537817996.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4853724241256714, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8616958856582642, + "num_tokens": 537864504.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5472917556762695, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8683134317398071, + "num_tokens": 537905755.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.9257465600967407, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8760470151901245, + "num_tokens": 537932359.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5244154930114746, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8805217742919922, + "num_tokens": 537973530.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5089741945266724, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8653618693351746, + "num_tokens": 538017684.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5533655881881714, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8709758520126343, + "num_tokens": 538054872.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 4.644598007202148, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8768911361694336, + "num_tokens": 538091871.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4887794256210327, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8821304440498352, + "num_tokens": 538131585.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7298840284347534, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8844982385635376, + "num_tokens": 538164235.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6499218940734863, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8784996271133423, + "num_tokens": 538198164.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.705879807472229, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8654243350028992, + "num_tokens": 538235219.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5038337707519531, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8688399791717529, + "num_tokens": 538279862.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6602545976638794, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.877042293548584, + "num_tokens": 538317292.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6292834281921387, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8748322129249573, + "num_tokens": 538354416.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6265619993209839, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.879135251045227, + "num_tokens": 538389353.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6781353950500488, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8729813098907471, + "num_tokens": 538423759.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5965849161148071, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8721336126327515, + "num_tokens": 538465285.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5846277475357056, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8830950856208801, + "num_tokens": 538507518.0, + "step": 14110 + }, + { + "epoch": 1.7950642411906883, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 3.699681520462036, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8721136450767517, + "num_tokens": 538549230.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7828477621078491, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8784080743789673, + "num_tokens": 538579139.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5202248096466064, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8691896200180054, + "num_tokens": 538626546.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5434091091156006, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8788133263587952, + "num_tokens": 538666520.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6159087419509888, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8696900606155396, + "num_tokens": 538702306.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5434623956680298, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.869806170463562, + "num_tokens": 538743116.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4707248210906982, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8634473085403442, + "num_tokens": 538790294.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6188218593597412, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.866178035736084, + "num_tokens": 538829487.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.767680048942566, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8602501749992371, + "num_tokens": 538863984.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5745104551315308, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8507567644119263, + "num_tokens": 538908537.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7478876113891602, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8633341193199158, + "num_tokens": 538942615.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5531967878341675, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8642648458480835, + "num_tokens": 538986362.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6926957368850708, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.884351372718811, + "num_tokens": 539018864.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5355637073516846, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8717727661132812, + "num_tokens": 539062741.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5521352291107178, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8728522062301636, + "num_tokens": 539101034.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.592786431312561, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8668513298034668, + "num_tokens": 539141110.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.434507966041565, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8774954080581665, + "num_tokens": 539183493.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5068438053131104, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8765126466751099, + "num_tokens": 539224633.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6255635023117065, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8848989009857178, + "num_tokens": 539257403.0, + "step": 14129 + }, + { + "epoch": 1.797481236483908, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6521660089492798, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8531920909881592, + "num_tokens": 539294988.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6860007047653198, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8555277585983276, + "num_tokens": 539330752.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5014948844909668, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8710663318634033, + "num_tokens": 539373818.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6241408586502075, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8745608329772949, + "num_tokens": 539408042.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6493165493011475, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.871419370174408, + "num_tokens": 539440376.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7965631484985352, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8718794584274292, + "num_tokens": 539469711.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.633347511291504, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8766152262687683, + "num_tokens": 539510310.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5964313745498657, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.873152494430542, + "num_tokens": 539545965.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6131983995437622, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8798495531082153, + "num_tokens": 539579701.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6047483682632446, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8735420107841492, + "num_tokens": 539619861.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5943999290466309, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8630942106246948, + "num_tokens": 539658334.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6334049701690674, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8629060983657837, + "num_tokens": 539696931.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4776060581207275, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8902975916862488, + "num_tokens": 539734997.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6608471870422363, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8681425452232361, + "num_tokens": 539771786.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5601037740707397, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8538620471954346, + "num_tokens": 539814640.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6037265062332153, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8800531625747681, + "num_tokens": 539849874.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4573038816452026, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8867343664169312, + "num_tokens": 539890548.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.577379822731018, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.865297794342041, + "num_tokens": 539931870.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 3.6713671684265137, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8655133247375488, + "num_tokens": 539970847.0, + "step": 14148 + }, + { + "epoch": 1.7998982317771275, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6727734804153442, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8564622402191162, + "num_tokens": 540006843.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8200139999389648, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8750308752059937, + "num_tokens": 540038336.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5466101169586182, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8901528716087341, + "num_tokens": 540074990.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.532651424407959, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8611088395118713, + "num_tokens": 540116268.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 4.652872562408447, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8690227270126343, + "num_tokens": 540155356.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5282233953475952, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8799327611923218, + "num_tokens": 540197246.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5656776428222656, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8683034181594849, + "num_tokens": 540237935.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5130207538604736, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8757688403129578, + "num_tokens": 540276023.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.655185341835022, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8725190758705139, + "num_tokens": 540308492.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6864140033721924, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8883492350578308, + "num_tokens": 540338795.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.525131344795227, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8701255917549133, + "num_tokens": 540381412.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6637580394744873, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8646352887153625, + "num_tokens": 540416969.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.562947154045105, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8691243529319763, + "num_tokens": 540458330.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5912624597549438, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.872368574142456, + "num_tokens": 540497007.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6351081132888794, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8603811860084534, + "num_tokens": 540538297.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.687351942062378, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8650520443916321, + "num_tokens": 540572466.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5455485582351685, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8766272664070129, + "num_tokens": 540612155.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5063468217849731, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.881250262260437, + "num_tokens": 540651552.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7470037937164307, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.861257791519165, + "num_tokens": 540685484.0, + "step": 14167 + }, + { + "epoch": 1.8023152270703473, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7417182922363281, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8654606938362122, + "num_tokens": 540721273.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6244926452636719, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8575515151023865, + "num_tokens": 540763870.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6569373607635498, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8632986545562744, + "num_tokens": 540800304.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.511648178100586, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8633603453636169, + "num_tokens": 540838159.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5100164413452148, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.886279284954071, + "num_tokens": 540878761.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.640175461769104, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8584550023078918, + "num_tokens": 540916089.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6347336769104004, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8703389763832092, + "num_tokens": 540953708.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6430294513702393, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.868471622467041, + "num_tokens": 540990858.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5365394353866577, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8621090054512024, + "num_tokens": 541032192.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5351542234420776, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8805257081985474, + "num_tokens": 541072198.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5264467000961304, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8736493587493896, + "num_tokens": 541113749.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6554841995239258, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8660693168640137, + "num_tokens": 541149550.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6242645978927612, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.873115599155426, + "num_tokens": 541183009.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5657936334609985, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8672817349433899, + "num_tokens": 541222055.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6194146871566772, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8803693652153015, + "num_tokens": 541256659.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7330571413040161, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8842188119888306, + "num_tokens": 541287851.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5513488054275513, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8801819086074829, + "num_tokens": 541329628.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5623195171356201, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8728153109550476, + "num_tokens": 541369811.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6715712547302246, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.875583291053772, + "num_tokens": 541402711.0, + "step": 14186 + }, + { + "epoch": 1.8047322223635671, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6168948411941528, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8666722178459167, + "num_tokens": 541441767.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5617817640304565, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8890995979309082, + "num_tokens": 541480562.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7285099029541016, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8786592483520508, + "num_tokens": 541512282.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.650191307067871, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8695037364959717, + "num_tokens": 541549436.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.696224570274353, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8676918745040894, + "num_tokens": 541587349.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.620731234550476, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8745267391204834, + "num_tokens": 541626097.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8263819217681885, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8665072321891785, + "num_tokens": 541656888.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5480284690856934, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8759775161743164, + "num_tokens": 541695899.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5979124307632446, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8650206923484802, + "num_tokens": 541736893.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6235977411270142, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8798739910125732, + "num_tokens": 541774776.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5034815073013306, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8691028356552124, + "num_tokens": 541817973.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.504879355430603, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8689088821411133, + "num_tokens": 541862110.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4520787000656128, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.889469563961029, + "num_tokens": 541901523.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6337308883666992, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.871211051940918, + "num_tokens": 541937760.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6064023971557617, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8809642195701599, + "num_tokens": 541970663.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.733553409576416, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8572216033935547, + "num_tokens": 542012358.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6903313398361206, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8749808073043823, + "num_tokens": 542048509.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5649062395095825, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8560954332351685, + "num_tokens": 542090592.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6795686483383179, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8722019195556641, + "num_tokens": 542124712.0, + "step": 14205 + }, + { + "epoch": 1.8071492176567867, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5163743495941162, + "learning_rate": 1e-06, + "loss": 0.317, + "mean_token_accuracy": 0.8959869146347046, + "num_tokens": 542165124.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5776640176773071, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8740086555480957, + "num_tokens": 542201668.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6161952018737793, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8679379224777222, + "num_tokens": 542242040.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6066133975982666, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.851967453956604, + "num_tokens": 542284857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.52424955368042, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.857598066329956, + "num_tokens": 542325881.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5451046228408813, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8619433641433716, + "num_tokens": 542366290.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.616690993309021, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8673799633979797, + "num_tokens": 542401526.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5611381530761719, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8667132258415222, + "num_tokens": 542442003.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7397292852401733, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8739974498748779, + "num_tokens": 542472896.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6090127229690552, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8625270128250122, + "num_tokens": 542513534.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6343268156051636, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8687994480133057, + "num_tokens": 542551566.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5850064754486084, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8580594062805176, + "num_tokens": 542590443.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5962185859680176, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8576828241348267, + "num_tokens": 542628304.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5350960493087769, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8696743845939636, + "num_tokens": 542670216.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6434941291809082, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.859350323677063, + "num_tokens": 542708013.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5454227924346924, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8681446313858032, + "num_tokens": 542746903.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5373018980026245, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8738545179367065, + "num_tokens": 542787340.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6235846281051636, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8643961548805237, + "num_tokens": 542826288.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6408355236053467, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8683669567108154, + "num_tokens": 542863248.0, + "step": 14224 + }, + { + "epoch": 1.8095662129500063, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.3539135456085205, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8621852993965149, + "num_tokens": 542898703.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.550416350364685, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8716473579406738, + "num_tokens": 542937572.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6415797472000122, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8722050786018372, + "num_tokens": 542972563.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5375744104385376, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8907356262207031, + "num_tokens": 543009523.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4459683895111084, + "learning_rate": 1e-06, + "loss": 0.3109, + "mean_token_accuracy": 0.8982639908790588, + "num_tokens": 543047364.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7387127876281738, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8772588968276978, + "num_tokens": 543080914.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6490883827209473, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8626022338867188, + "num_tokens": 543116212.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5070350170135498, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8766089677810669, + "num_tokens": 543156341.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6729063987731934, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8686250448226929, + "num_tokens": 543190576.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6108877658843994, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8705346584320068, + "num_tokens": 543228714.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6704360246658325, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8782020807266235, + "num_tokens": 543266122.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6228272914886475, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8719442486763, + "num_tokens": 543302983.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5476821660995483, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8666307926177979, + "num_tokens": 543340862.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8406108617782593, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8618093729019165, + "num_tokens": 543373450.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6860954761505127, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.867068886756897, + "num_tokens": 543416312.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 2.2828848361968994, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8861021995544434, + "num_tokens": 543451329.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5416545867919922, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8865203857421875, + "num_tokens": 543492559.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7589384317398071, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8619944453239441, + "num_tokens": 543527842.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5031360387802124, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8861823081970215, + "num_tokens": 543567969.0, + "step": 14243 + }, + { + "epoch": 1.811983208243226, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5112788677215576, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8727267980575562, + "num_tokens": 543612046.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.627025842666626, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8616598844528198, + "num_tokens": 543652910.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6449048519134521, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8752213716506958, + "num_tokens": 543687481.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6704208850860596, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8589165806770325, + "num_tokens": 543724870.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6925863027572632, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8768593072891235, + "num_tokens": 543757740.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5321552753448486, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8743211627006531, + "num_tokens": 543797191.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.596426248550415, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8587929010391235, + "num_tokens": 543836529.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5380316972732544, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8729296326637268, + "num_tokens": 543877557.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5896039009094238, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8642725944519043, + "num_tokens": 543919041.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5028785467147827, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8717213273048401, + "num_tokens": 543963846.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.565898060798645, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8716478943824768, + "num_tokens": 544001139.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5911110639572144, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8802261352539062, + "num_tokens": 544034442.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.712160348892212, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.869827389717102, + "num_tokens": 544073447.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4335609674453735, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8723090887069702, + "num_tokens": 544119330.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7264522314071655, + "learning_rate": 1e-06, + "loss": 0.3387, + "mean_token_accuracy": 0.885602593421936, + "num_tokens": 544147930.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5980908870697021, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.867972731590271, + "num_tokens": 544184887.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.496363878250122, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8694006204605103, + "num_tokens": 544227379.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5447105169296265, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8722408413887024, + "num_tokens": 544267777.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7147784233093262, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8896933794021606, + "num_tokens": 544299348.0, + "step": 14262 + }, + { + "epoch": 1.8144002035364457, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.672694444656372, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8615232706069946, + "num_tokens": 544335469.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6415104866027832, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8757084608078003, + "num_tokens": 544369944.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7611080408096313, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8645896911621094, + "num_tokens": 544406537.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6676523685455322, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.849302351474762, + "num_tokens": 544445557.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5060288906097412, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8814544677734375, + "num_tokens": 544489537.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5381221771240234, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8704357743263245, + "num_tokens": 544528901.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.530463695526123, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8742088675498962, + "num_tokens": 544568700.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5211026668548584, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8774850368499756, + "num_tokens": 544609228.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7442290782928467, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8696041107177734, + "num_tokens": 544646120.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6020281314849854, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8708537817001343, + "num_tokens": 544683275.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.551926851272583, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8770467638969421, + "num_tokens": 544727218.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6907449960708618, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8683253526687622, + "num_tokens": 544762969.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6126160621643066, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8736284971237183, + "num_tokens": 544800916.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.666567087173462, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871833324432373, + "num_tokens": 544836810.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.600374460220337, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8718380928039551, + "num_tokens": 544873468.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.642553448677063, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8748781681060791, + "num_tokens": 544912086.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.782271146774292, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8647178411483765, + "num_tokens": 544946452.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6196209192276, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8731338381767273, + "num_tokens": 544985398.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6118998527526855, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8644580841064453, + "num_tokens": 545025854.0, + "step": 14281 + }, + { + "epoch": 1.8168171988296655, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6755726337432861, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8597628474235535, + "num_tokens": 545061970.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5712368488311768, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.887736439704895, + "num_tokens": 545097535.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6652199029922485, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.874215304851532, + "num_tokens": 545134336.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.681699514389038, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8628658056259155, + "num_tokens": 545170462.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.680891990661621, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8517810702323914, + "num_tokens": 545210008.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5471442937850952, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8854027986526489, + "num_tokens": 545248846.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6123158931732178, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8790655136108398, + "num_tokens": 545285878.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6158303022384644, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8722429275512695, + "num_tokens": 545323984.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.508945345878601, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8753059506416321, + "num_tokens": 545365577.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.542441487312317, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8757479786872864, + "num_tokens": 545403976.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6346652507781982, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8774585723876953, + "num_tokens": 545440608.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5511212348937988, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.879370391368866, + "num_tokens": 545481259.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8154338598251343, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8566806316375732, + "num_tokens": 545514634.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6978871822357178, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8671388030052185, + "num_tokens": 545549993.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6035487651824951, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.875027596950531, + "num_tokens": 545590022.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.61826491355896, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8684307932853699, + "num_tokens": 545630338.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.514650583267212, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8899122476577759, + "num_tokens": 545666099.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.549278974533081, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.880283772945404, + "num_tokens": 545706314.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5667016506195068, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8793900012969971, + "num_tokens": 545743742.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7062870264053345, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8670756816864014, + "num_tokens": 545779588.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.684844970703125, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8595770597457886, + "num_tokens": 545816452.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5396991968154907, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8751627802848816, + "num_tokens": 545858499.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5993080139160156, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8716945648193359, + "num_tokens": 545892910.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6096264123916626, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8613130450248718, + "num_tokens": 545932730.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.621970534324646, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8736009001731873, + "num_tokens": 545968728.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5790445804595947, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8792874813079834, + "num_tokens": 546009205.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.679221272468567, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8625537157058716, + "num_tokens": 546045274.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6729393005371094, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8716399669647217, + "num_tokens": 546080415.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6286569833755493, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8904087543487549, + "num_tokens": 546116041.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.468917965888977, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8724756240844727, + "num_tokens": 546155871.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6574029922485352, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8553757667541504, + "num_tokens": 546196565.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7020182609558105, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8703661561012268, + "num_tokens": 546230560.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5885059833526611, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8748117089271545, + "num_tokens": 546268087.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4854439496994019, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8891830444335938, + "num_tokens": 546309043.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5128757953643799, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8823174238204956, + "num_tokens": 546346792.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6110750436782837, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8721617460250854, + "num_tokens": 546382744.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.571721076965332, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8674306869506836, + "num_tokens": 546420983.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7718844413757324, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8615583777427673, + "num_tokens": 546452830.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4961110353469849, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8649584054946899, + "num_tokens": 546498682.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5350673198699951, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8557125926017761, + "num_tokens": 546539949.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6277034282684326, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8586414456367493, + "num_tokens": 546579429.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7115803956985474, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.858455240726471, + "num_tokens": 546613686.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5450702905654907, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8699633479118347, + "num_tokens": 546649620.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7526055574417114, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.866270899772644, + "num_tokens": 546686437.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6621801853179932, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8749909400939941, + "num_tokens": 546727215.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5668941736221313, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8918572068214417, + "num_tokens": 546763715.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5676064491271973, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8601175546646118, + "num_tokens": 546804350.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7749321460723877, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8513280153274536, + "num_tokens": 546842227.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5743169784545898, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8571735620498657, + "num_tokens": 546885703.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5833336114883423, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8889367580413818, + "num_tokens": 546922107.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5605227947235107, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8624172806739807, + "num_tokens": 546960642.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6799087524414062, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8619431257247925, + "num_tokens": 546998351.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5556082725524902, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8746769428253174, + "num_tokens": 547039463.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6691367626190186, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8564357161521912, + "num_tokens": 547080905.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5541603565216064, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8770433664321899, + "num_tokens": 547116994.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6504439115524292, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8589686751365662, + "num_tokens": 547155893.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.502939224243164, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.867616593837738, + "num_tokens": 547196722.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6348708868026733, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8507525324821472, + "num_tokens": 547237723.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6137974262237549, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8652026653289795, + "num_tokens": 547277240.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6560180187225342, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8694913983345032, + "num_tokens": 547315088.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6727209091186523, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8562153577804565, + "num_tokens": 547353183.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6701092720031738, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8676311373710632, + "num_tokens": 547389678.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5187586545944214, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8740913271903992, + "num_tokens": 547430529.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6573879718780518, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.875838041305542, + "num_tokens": 547464663.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5504440069198608, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8756071329116821, + "num_tokens": 547502659.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.3745542764663696, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8813600540161133, + "num_tokens": 547548828.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6623976230621338, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8623895645141602, + "num_tokens": 547584529.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6035244464874268, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8693137764930725, + "num_tokens": 547623093.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6783932447433472, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8748512268066406, + "num_tokens": 547661141.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7053648233413696, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8529975414276123, + "num_tokens": 547698454.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.564228892326355, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8589194416999817, + "num_tokens": 547739915.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.558605670928955, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8858835101127625, + "num_tokens": 547775025.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6034098863601685, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8696615695953369, + "num_tokens": 547808948.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.3598309755325317, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8878571391105652, + "num_tokens": 547852562.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.548804759979248, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8747861385345459, + "num_tokens": 547889666.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6016498804092407, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8866597414016724, + "num_tokens": 547923095.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5258980989456177, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8678271770477295, + "num_tokens": 547963549.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7201969623565674, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8693304657936096, + "num_tokens": 547996440.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5676500797271729, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8703742027282715, + "num_tokens": 548034377.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5166031122207642, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8781790733337402, + "num_tokens": 548075918.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5323975086212158, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8645598888397217, + "num_tokens": 548117436.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5896251201629639, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8794275522232056, + "num_tokens": 548159049.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.520479679107666, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8634865283966064, + "num_tokens": 548202531.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.815521001815796, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8618329763412476, + "num_tokens": 548234707.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.638647198677063, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8632036447525024, + "num_tokens": 548276256.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.63478684425354, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8846715688705444, + "num_tokens": 548311908.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6682872772216797, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8653042316436768, + "num_tokens": 548354289.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5967925786972046, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8702333569526672, + "num_tokens": 548402740.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8027116060256958, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8730995059013367, + "num_tokens": 548440152.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5054856538772583, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8770320415496826, + "num_tokens": 548481820.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.620843529701233, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8675981760025024, + "num_tokens": 548523489.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7663941383361816, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8709884881973267, + "num_tokens": 548555863.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.753258466720581, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8644968271255493, + "num_tokens": 548589182.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6607863903045654, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8710507750511169, + "num_tokens": 548623279.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6289013624191284, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.873432457447052, + "num_tokens": 548660071.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5430257320404053, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8596723079681396, + "num_tokens": 548701375.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.46259605884552, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8746094703674316, + "num_tokens": 548741964.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5646263360977173, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.871698260307312, + "num_tokens": 548784166.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7566312551498413, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8726352453231812, + "num_tokens": 548817306.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7362409830093384, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.854827880859375, + "num_tokens": 548853005.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.678589940071106, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8642706871032715, + "num_tokens": 548889753.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5427182912826538, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8729840517044067, + "num_tokens": 548928513.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5143942832946777, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8731478452682495, + "num_tokens": 548973837.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4610817432403564, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8875675201416016, + "num_tokens": 549014161.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5409489870071411, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.864960253238678, + "num_tokens": 549059099.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.8114237785339355, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8627177476882935, + "num_tokens": 549092497.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7047200202941895, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8584069013595581, + "num_tokens": 549131038.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7732062339782715, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8916503190994263, + "num_tokens": 549159642.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6239303350448608, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8703068494796753, + "num_tokens": 549194569.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7318711280822754, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8747730255126953, + "num_tokens": 549228549.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6752259731292725, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8599079251289368, + "num_tokens": 549266346.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6464407444000244, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8795852661132812, + "num_tokens": 549303625.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6157050132751465, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8687077760696411, + "num_tokens": 549341561.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5438777208328247, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8758783340454102, + "num_tokens": 549379492.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6607879400253296, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8704302310943604, + "num_tokens": 549416202.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7441012859344482, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8858241438865662, + "num_tokens": 549451197.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6450762748718262, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8780483603477478, + "num_tokens": 549492205.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5313876867294312, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8770942687988281, + "num_tokens": 549532611.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5515625476837158, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8759229183197021, + "num_tokens": 549570771.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7212785482406616, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8752252459526062, + "num_tokens": 549602539.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5704119205474854, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8778603076934814, + "num_tokens": 549642050.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5200302600860596, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8853479623794556, + "num_tokens": 549679087.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5999494791030884, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8821958899497986, + "num_tokens": 549714623.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7235242128372192, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8619997501373291, + "num_tokens": 549753213.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5779956579208374, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8816878199577332, + "num_tokens": 549790946.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.52108895778656, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8771474957466125, + "num_tokens": 549830901.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.52769935131073, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8809324502944946, + "num_tokens": 549870259.0, + "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.753934621810913, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8755793571472168, + "num_tokens": 549902384.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.609924077987671, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8714364767074585, + "num_tokens": 549939429.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.815593957901001, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8768741488456726, + "num_tokens": 549970149.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.511926531791687, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8859501481056213, + "num_tokens": 550009348.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 16.83147430419922, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.849724531173706, + "num_tokens": 550052360.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 2.1884889602661133, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8689038753509521, + "num_tokens": 550092356.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5663859844207764, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8701391220092773, + "num_tokens": 550131534.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.841868281364441, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8516848683357239, + "num_tokens": 550170333.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.653180480003357, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8690328598022461, + "num_tokens": 550206170.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5765233039855957, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8677945137023926, + "num_tokens": 550245312.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6510262489318848, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8609622716903687, + "num_tokens": 550284598.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5735571384429932, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8702065944671631, + "num_tokens": 550320703.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.460156798362732, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8678767681121826, + "num_tokens": 550362316.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5831815004348755, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8626725673675537, + "num_tokens": 550403504.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5601084232330322, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8797231912612915, + "num_tokens": 550440402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.421608567237854, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8835947513580322, + "num_tokens": 550480174.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6471444368362427, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8776031732559204, + "num_tokens": 550513760.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.740206003189087, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8618420362472534, + "num_tokens": 550548088.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6508389711380005, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8733710050582886, + "num_tokens": 550582770.0, + "step": 14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6348053216934204, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8683050274848938, + "num_tokens": 550619519.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5574233531951904, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8592919111251831, + "num_tokens": 550663770.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6096551418304443, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.878893256187439, + "num_tokens": 550698789.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5020657777786255, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8808447122573853, + "num_tokens": 550739472.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5190376043319702, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8841186761856079, + "num_tokens": 550778322.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5812894105911255, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8798661231994629, + "num_tokens": 550819611.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5816681385040283, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8670268058776855, + "num_tokens": 550859909.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5899893045425415, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8792534470558167, + "num_tokens": 550898897.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6155171394348145, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8768427968025208, + "num_tokens": 550933272.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6232709884643555, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8618329763412476, + "num_tokens": 550971099.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5652834177017212, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.867415189743042, + "num_tokens": 551009011.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6612250804901123, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8712058067321777, + "num_tokens": 551044696.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5884926319122314, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.867453932762146, + "num_tokens": 551082746.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4954131841659546, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.874649167060852, + "num_tokens": 551122039.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.685517430305481, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8726013898849487, + "num_tokens": 551158003.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5920416116714478, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.878301739692688, + "num_tokens": 551193601.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6481397151947021, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8744088411331177, + "num_tokens": 551238249.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.514304518699646, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8808743357658386, + "num_tokens": 551276696.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4837206602096558, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8799965381622314, + "num_tokens": 551315578.0, + "step": 14446 + }, + { + "epoch": 1.8378068947970996, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5096180438995361, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8749991655349731, + "num_tokens": 551355373.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7020436525344849, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.862981915473938, + "num_tokens": 551392575.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6739535331726074, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8731098771095276, + "num_tokens": 551428156.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6038920879364014, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8667320013046265, + "num_tokens": 551470036.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6354669332504272, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8598591685295105, + "num_tokens": 551509088.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6513023376464844, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8704252243041992, + "num_tokens": 551544671.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4952207803726196, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8678047060966492, + "num_tokens": 551588071.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5638916492462158, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.884535551071167, + "num_tokens": 551624060.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.640734076499939, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8737282752990723, + "num_tokens": 551660512.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.551343321800232, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8759416341781616, + "num_tokens": 551701533.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5062220096588135, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8765809535980225, + "num_tokens": 551742478.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5700849294662476, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.857408881187439, + "num_tokens": 551785920.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.5852078199386597, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8664295673370361, + "num_tokens": 551823683.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6151926517486572, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8644598126411438, + "num_tokens": 551862729.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6043272018432617, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8752365112304688, + "num_tokens": 551900924.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.696229100227356, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8646408319473267, + "num_tokens": 551938391.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6786086559295654, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8764838576316833, + "num_tokens": 551977328.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6822739839553833, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8659552335739136, + "num_tokens": 552021867.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6551142930984497, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8674430251121521, + "num_tokens": 552060187.0, + "step": 14465 + }, + { + "epoch": 1.8402238900903192, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.4889167547225952, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.861093282699585, + "num_tokens": 552105314.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6663594245910645, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8738516569137573, + "num_tokens": 552143182.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6783348321914673, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8768928050994873, + "num_tokens": 552175114.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.6626124382019043, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8703679442405701, + "num_tokens": 552214310.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.626586675643921, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8746124505996704, + "num_tokens": 552254034.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7448121309280396, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8705049157142639, + "num_tokens": 552289683.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 2.4557113647460938e-05, + "grad_norm": 1.7625468969345093, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.857231855392456, + "num_tokens": 552324530.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6800618171691895, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8694218397140503, + "num_tokens": 552360398.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5513708591461182, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8756458163261414, + "num_tokens": 552398997.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6623499393463135, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8791933059692383, + "num_tokens": 552437024.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6442553997039795, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8749183416366577, + "num_tokens": 552474564.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7401798963546753, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8653771877288818, + "num_tokens": 552513034.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.766648292541504, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8696808815002441, + "num_tokens": 552544694.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5757777690887451, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8824721574783325, + "num_tokens": 552579402.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5723479986190796, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8705956935882568, + "num_tokens": 552618086.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6094495058059692, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8759251832962036, + "num_tokens": 552655946.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.692095160484314, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8619712591171265, + "num_tokens": 552693719.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4941296577453613, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.883094072341919, + "num_tokens": 552733626.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6240348815917969, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.873958170413971, + "num_tokens": 552768063.0, + "step": 14484 + }, + { + "epoch": 1.842640885383539, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7464759349822998, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8648321628570557, + "num_tokens": 552805014.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6224110126495361, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8699014782905579, + "num_tokens": 552844942.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6413853168487549, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8712073564529419, + "num_tokens": 552882047.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7462725639343262, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8398484587669373, + "num_tokens": 552924676.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6997075080871582, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8883466720581055, + "num_tokens": 552958040.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7269668579101562, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8664548993110657, + "num_tokens": 552991451.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4146820306777954, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8786330223083496, + "num_tokens": 553035799.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5563764572143555, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8883025050163269, + "num_tokens": 553074456.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6879680156707764, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8751488327980042, + "num_tokens": 553107216.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4684468507766724, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8774017095565796, + "num_tokens": 553150558.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6967153549194336, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8717157244682312, + "num_tokens": 553184565.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6613290309906006, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8708996176719666, + "num_tokens": 553219830.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4231951236724854, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8694376349449158, + "num_tokens": 553266490.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4875773191452026, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8699711561203003, + "num_tokens": 553307890.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4971981048583984, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.869606614112854, + "num_tokens": 553350548.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6506353616714478, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8629605770111084, + "num_tokens": 553387859.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6205493211746216, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8739851713180542, + "num_tokens": 553426283.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6407722234725952, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8768803477287292, + "num_tokens": 553465456.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5262717008590698, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8735964894294739, + "num_tokens": 553503321.0, + "step": 14503 + }, + { + "epoch": 1.8450578806767588, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6564847230911255, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8788405656814575, + "num_tokens": 553540932.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6021881103515625, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.875281572341919, + "num_tokens": 553578487.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5204070806503296, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.872025191783905, + "num_tokens": 553620964.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.574327826499939, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8740005493164062, + "num_tokens": 553665661.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6045629978179932, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8634687066078186, + "num_tokens": 553704271.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 2.312403678894043, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8648735284805298, + "num_tokens": 553744584.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.8273284435272217, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8686283826828003, + "num_tokens": 553776074.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.56436288356781, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8649176955223083, + "num_tokens": 553817375.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6268032789230347, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8610253930091858, + "num_tokens": 553856112.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.694480061531067, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8776493668556213, + "num_tokens": 553895193.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6337544918060303, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8458711504936218, + "num_tokens": 553936044.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5516853332519531, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8584136962890625, + "num_tokens": 553976461.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5597747564315796, + "learning_rate": 1e-06, + "loss": 0.3216, + "mean_token_accuracy": 0.8952369689941406, + "num_tokens": 554010838.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.745350956916809, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.858603835105896, + "num_tokens": 554044230.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5406357049942017, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8762088418006897, + "num_tokens": 554080730.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.566957712173462, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8560022711753845, + "num_tokens": 554122518.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7922558784484863, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8588361740112305, + "num_tokens": 554162107.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.568108081817627, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.877220630645752, + "num_tokens": 554203194.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.613912582397461, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8828138113021851, + "num_tokens": 554239391.0, + "step": 14522 + }, + { + "epoch": 1.8474748759699784, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.654850959777832, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8653825521469116, + "num_tokens": 554274199.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7913875579833984, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8633284568786621, + "num_tokens": 554314349.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5986509323120117, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8770239353179932, + "num_tokens": 554353760.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6734758615493774, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8818435668945312, + "num_tokens": 554389594.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6223300695419312, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8723832368850708, + "num_tokens": 554424918.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6816400289535522, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8681177496910095, + "num_tokens": 554459790.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.549080491065979, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8788458108901978, + "num_tokens": 554496789.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.692528247833252, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8650332093238831, + "num_tokens": 554529725.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7152705192565918, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8729586601257324, + "num_tokens": 554563234.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6833665370941162, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8687769770622253, + "num_tokens": 554598069.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.6684612035751343, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.857266902923584, + "num_tokens": 554639952.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5796724557876587, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.872809886932373, + "num_tokens": 554676333.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.582864761352539, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8762578964233398, + "num_tokens": 554718688.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4825795888900757, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8775210976600647, + "num_tokens": 554757780.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5531421899795532, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8743193745613098, + "num_tokens": 554796437.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5016181468963623, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8817036151885986, + "num_tokens": 554834006.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5972497463226318, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8674489259719849, + "num_tokens": 554870631.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5657943487167358, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8726488351821899, + "num_tokens": 554912485.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.4908523559570312, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8682256937026978, + "num_tokens": 554951907.0, + "step": 14541 + }, + { + "epoch": 1.849891871263198, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 6.696669101715088, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8654988408088684, + "num_tokens": 554985364.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.5642633438110352, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8702684640884399, + "num_tokens": 555024471.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 2.467632293701172e-05, + "grad_norm": 1.7499048709869385, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8604211807250977, + "num_tokens": 555058732.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5645934343338013, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8619145154953003, + "num_tokens": 555102083.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6181848049163818, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.873746931552887, + "num_tokens": 555137649.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5969109535217285, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8707818984985352, + "num_tokens": 555175716.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5753353834152222, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8747684359550476, + "num_tokens": 555214971.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7215853929519653, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8714578151702881, + "num_tokens": 555249548.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6104321479797363, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8735181093215942, + "num_tokens": 555287870.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6612640619277954, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8819882869720459, + "num_tokens": 555321412.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6833102703094482, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8719178438186646, + "num_tokens": 555354350.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5615404844284058, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8739514946937561, + "num_tokens": 555393637.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.668550729751587, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8597347736358643, + "num_tokens": 555429174.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7296075820922852, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8795249462127686, + "num_tokens": 555465262.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6766078472137451, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8526067137718201, + "num_tokens": 555504429.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5863689184188843, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8569546341896057, + "num_tokens": 555545711.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.617363452911377, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8647725582122803, + "num_tokens": 555583956.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.658682107925415, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8578875660896301, + "num_tokens": 555622615.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.703922152519226, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8678122758865356, + "num_tokens": 555659004.0, + "step": 14560 + }, + { + "epoch": 1.8523088665564178, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6850306987762451, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8679152727127075, + "num_tokens": 555693281.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5411783456802368, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8643579483032227, + "num_tokens": 555734831.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5042625665664673, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8738726377487183, + "num_tokens": 555774208.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6894303560256958, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.871418297290802, + "num_tokens": 555807757.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.638048768043518, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8579808473587036, + "num_tokens": 555848329.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5825425386428833, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8821907043457031, + "num_tokens": 555884855.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6770033836364746, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8700270056724548, + "num_tokens": 555920591.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6996403932571411, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8846951723098755, + "num_tokens": 555953790.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5541249513626099, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.880518913269043, + "num_tokens": 555991686.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5928261280059814, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8660815954208374, + "num_tokens": 556031138.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7291840314865112, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8607800602912903, + "num_tokens": 556068567.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5634112358093262, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8725790977478027, + "num_tokens": 556107014.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5173420906066895, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.874932050704956, + "num_tokens": 556149021.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7105555534362793, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8698108196258545, + "num_tokens": 556185515.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5461070537567139, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8602942228317261, + "num_tokens": 556225749.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6266788244247437, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8747811913490295, + "num_tokens": 556261989.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7665653228759766, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8742070198059082, + "num_tokens": 556294044.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.600451946258545, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8585397005081177, + "num_tokens": 556333068.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5720643997192383, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8818667531013489, + "num_tokens": 556371297.0, + "step": 14579 + }, + { + "epoch": 1.8547258618496376, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6792867183685303, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8531252145767212, + "num_tokens": 556411770.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6651921272277832, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8678837418556213, + "num_tokens": 556447823.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.480069637298584, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8730372786521912, + "num_tokens": 556490369.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6440975666046143, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8877294063568115, + "num_tokens": 556523640.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.596320629119873, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8615331649780273, + "num_tokens": 556565185.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7116740942001343, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8788973093032837, + "num_tokens": 556596951.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.523127555847168, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8626896739006042, + "num_tokens": 556634992.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5674649477005005, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8679436445236206, + "num_tokens": 556678762.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.508733868598938, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.874964714050293, + "num_tokens": 556717470.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6400532722473145, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8765528798103333, + "num_tokens": 556754391.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5700690746307373, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8710551261901855, + "num_tokens": 556793696.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5965299606323242, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8593592047691345, + "num_tokens": 556834335.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5858443975448608, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8796536326408386, + "num_tokens": 556870886.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.591220736503601, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8679660558700562, + "num_tokens": 556910590.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7024677991867065, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8715617656707764, + "num_tokens": 556944862.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.710595726966858, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8670748472213745, + "num_tokens": 556980805.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7178642749786377, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8602360486984253, + "num_tokens": 557019832.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6944453716278076, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.845899224281311, + "num_tokens": 557056408.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.784214735031128, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8617254495620728, + "num_tokens": 557091244.0, + "step": 14598 + }, + { + "epoch": 1.8571428571428572, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6076395511627197, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.874449610710144, + "num_tokens": 557126131.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6207538843154907, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.863513708114624, + "num_tokens": 557167270.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5027496814727783, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8743923902511597, + "num_tokens": 557208228.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.7098779678344727, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8716742992401123, + "num_tokens": 557247384.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.8305152654647827, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8640943765640259, + "num_tokens": 557281648.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6821235418319702, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8538234233856201, + "num_tokens": 557321977.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.5006306171417236, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8533121347427368, + "num_tokens": 557368821.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.579037070274353, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8680670857429504, + "num_tokens": 557412082.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7778990268707275, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8718456029891968, + "num_tokens": 557450613.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5517475605010986, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.874957263469696, + "num_tokens": 557492196.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5859670639038086, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8799222111701965, + "num_tokens": 557526917.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.8300745487213135, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8689770698547363, + "num_tokens": 557560420.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 2.47955322265625e-05, + "grad_norm": 1.6088284254074097, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8747460246086121, + "num_tokens": 557597129.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5848650932312012, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8759233951568604, + "num_tokens": 557639037.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6521049737930298, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.887628436088562, + "num_tokens": 557678114.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5851606130599976, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8691924810409546, + "num_tokens": 557716470.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7718290090560913, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8698586225509644, + "num_tokens": 557750589.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5529695749282837, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8871920704841614, + "num_tokens": 557785664.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6415338516235352, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8842298984527588, + "num_tokens": 557818988.0, + "step": 14617 + }, + { + "epoch": 1.8595598524360768, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5897173881530762, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8647890090942383, + "num_tokens": 557857444.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.565730333328247, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8681786060333252, + "num_tokens": 557897769.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5261762142181396, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.876736044883728, + "num_tokens": 557936145.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4905914068222046, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8826025724411011, + "num_tokens": 557972972.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6325695514678955, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8619009852409363, + "num_tokens": 558014923.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6079658269882202, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8832266330718994, + "num_tokens": 558049277.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7436251640319824, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8619775772094727, + "num_tokens": 558080491.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6080408096313477, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8774330615997314, + "num_tokens": 558118792.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6174463033676147, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8591564893722534, + "num_tokens": 558163002.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4558939933776855, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.892265260219574, + "num_tokens": 558203548.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5849851369857788, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8766523003578186, + "num_tokens": 558237866.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6779955625534058, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8760315179824829, + "num_tokens": 558276916.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4799920320510864, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8910263776779175, + "num_tokens": 558312887.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.581438422203064, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8687818050384521, + "num_tokens": 558353407.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.534738540649414, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8783174753189087, + "num_tokens": 558394434.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.639535903930664, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8838653564453125, + "num_tokens": 558427859.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5051405429840088, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8729850053787231, + "num_tokens": 558468891.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5624926090240479, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8751689195632935, + "num_tokens": 558506784.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6782217025756836, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8711259365081787, + "num_tokens": 558545541.0, + "step": 14636 + }, + { + "epoch": 1.8619768477292964, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.479750394821167, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8695372343063354, + "num_tokens": 558588704.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5035922527313232, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8839580416679382, + "num_tokens": 558628412.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.534519076347351, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8728300333023071, + "num_tokens": 558671315.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.531950831413269, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8810180425643921, + "num_tokens": 558710544.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6516989469528198, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8625922203063965, + "num_tokens": 558751269.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.512059211730957, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8903611898422241, + "num_tokens": 558786527.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5470300912857056, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8774726390838623, + "num_tokens": 558824457.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7782162427902222, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8840356469154358, + "num_tokens": 558853200.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6953845024108887, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8527179956436157, + "num_tokens": 558894392.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4929076433181763, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8666814565658569, + "num_tokens": 558939826.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5698448419570923, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.883469820022583, + "num_tokens": 558975872.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.553135871887207, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8760747313499451, + "num_tokens": 559018984.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5994133949279785, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8654952049255371, + "num_tokens": 559055516.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.711089849472046, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8742985725402832, + "num_tokens": 559088386.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.533303141593933, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8702698945999146, + "num_tokens": 559129019.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6515939235687256, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8631699681282043, + "num_tokens": 559164360.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4926836490631104, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8820139169692993, + "num_tokens": 559203853.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5092140436172485, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8752027153968811, + "num_tokens": 559244375.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.619210958480835, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.870986819267273, + "num_tokens": 559280113.0, + "step": 14655 + }, + { + "epoch": 1.8643938430225162, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5128264427185059, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8720823526382446, + "num_tokens": 559322063.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6080763339996338, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8753583431243896, + "num_tokens": 559357498.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7567410469055176, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8570410013198853, + "num_tokens": 559392059.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6130266189575195, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8623429536819458, + "num_tokens": 559431168.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6320165395736694, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8670566082000732, + "num_tokens": 559465790.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6008764505386353, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8786053657531738, + "num_tokens": 559503671.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6099156141281128, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8716350793838501, + "num_tokens": 559538881.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5871232748031616, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8689751625061035, + "num_tokens": 559584713.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6646466255187988, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8785730600357056, + "num_tokens": 559617477.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.58953857421875, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8809369802474976, + "num_tokens": 559651453.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5296083688735962, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.882360577583313, + "num_tokens": 559687211.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.597641706466675, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8681646585464478, + "num_tokens": 559720711.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.707570195198059, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8605493307113647, + "num_tokens": 559755992.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4660714864730835, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8841381669044495, + "num_tokens": 559798442.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5999761819839478, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8671813607215881, + "num_tokens": 559834660.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7121468782424927, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8692419528961182, + "num_tokens": 559866319.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6872339248657227, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8841909170150757, + "num_tokens": 559897920.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5309773683547974, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8688239455223083, + "num_tokens": 559941856.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 3.6531834602355957, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8668850064277649, + "num_tokens": 559983746.0, + "step": 14674 + }, + { + "epoch": 1.866810838315736, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5703471899032593, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8618412017822266, + "num_tokens": 560025131.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.514585256576538, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8825673460960388, + "num_tokens": 560067643.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7429344654083252, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8751690983772278, + "num_tokens": 560099116.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.774599313735962, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8691967129707336, + "num_tokens": 560131088.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4863591194152832, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8808845281600952, + "num_tokens": 560171931.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7242910861968994, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8546243906021118, + "num_tokens": 560206031.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.726378321647644, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8682419061660767, + "num_tokens": 560238781.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5629111528396606, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8838657140731812, + "num_tokens": 560277519.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6857401132583618, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8586143851280212, + "num_tokens": 560313892.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.650868535041809, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8715992569923401, + "num_tokens": 560348997.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.463415265083313, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8879886269569397, + "num_tokens": 560390490.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4741793870925903, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8765782117843628, + "num_tokens": 560432206.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.608232021331787, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8575994968414307, + "num_tokens": 560472270.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8838928937911987, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8624380230903625, + "num_tokens": 560504828.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5775700807571411, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8673426508903503, + "num_tokens": 560544410.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5401805639266968, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8790065050125122, + "num_tokens": 560584769.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5371414422988892, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8654758334159851, + "num_tokens": 560626186.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8573287725448608, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8676999807357788, + "num_tokens": 560654728.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4023581743240356, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8858656287193298, + "num_tokens": 560698739.0, + "step": 14693 + }, + { + "epoch": 1.8692278336089556, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7517625093460083, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8789424896240234, + "num_tokens": 560730075.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6385499238967896, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.863967776298523, + "num_tokens": 560769690.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5256885290145874, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8605705499649048, + "num_tokens": 560817663.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6673235893249512, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8711925148963928, + "num_tokens": 560853649.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5529338121414185, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8750686645507812, + "num_tokens": 560896197.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7356278896331787, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8600355982780457, + "num_tokens": 560930535.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8269097805023193, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8604728579521179, + "num_tokens": 560965027.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.51731276512146, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8796002864837646, + "num_tokens": 561004254.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7920832633972168, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.864627480506897, + "num_tokens": 561043400.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5930845737457275, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.875148355960846, + "num_tokens": 561081900.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5149871110916138, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8684020042419434, + "num_tokens": 561120966.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4262669086456299, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8740440607070923, + "num_tokens": 561163612.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5336617231369019, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8794521689414978, + "num_tokens": 561201425.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.597478985786438, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8494571447372437, + "num_tokens": 561242280.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8406516313552856, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8759717345237732, + "num_tokens": 561270013.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5865670442581177, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8831424713134766, + "num_tokens": 561303090.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.695541501045227, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8677999973297119, + "num_tokens": 561340870.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5649261474609375, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.871037483215332, + "num_tokens": 561381802.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4991806745529175, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.874163031578064, + "num_tokens": 561422606.0, + "step": 14712 + }, + { + "epoch": 1.8716448289021752, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5667188167572021, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8736409544944763, + "num_tokens": 561463822.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5535354614257812, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8828144073486328, + "num_tokens": 561504267.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5612757205963135, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8735132217407227, + "num_tokens": 561539592.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5400686264038086, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8874831199645996, + "num_tokens": 561574649.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6548835039138794, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8558813333511353, + "num_tokens": 561614160.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6858493089675903, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8603828549385071, + "num_tokens": 561651714.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7728182077407837, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8744366765022278, + "num_tokens": 561691766.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6167391538619995, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.867464542388916, + "num_tokens": 561732779.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5965169668197632, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8750152587890625, + "num_tokens": 561773826.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6307077407836914, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8728533983230591, + "num_tokens": 561813026.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5335057973861694, + "learning_rate": 1e-06, + "loss": 0.3378, + "mean_token_accuracy": 0.8885241746902466, + "num_tokens": 561848881.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6246744394302368, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8652900457382202, + "num_tokens": 561892308.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6250085830688477, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8699777722358704, + "num_tokens": 561928468.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7737715244293213, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8561445474624634, + "num_tokens": 561962733.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 3.685823917388916, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8678270578384399, + "num_tokens": 562003185.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.600615382194519, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8630080223083496, + "num_tokens": 562040525.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6800044775009155, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.869353711605072, + "num_tokens": 562077342.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.624573826789856, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.858321487903595, + "num_tokens": 562116629.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4244507551193237, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8771212100982666, + "num_tokens": 562160990.0, + "step": 14731 + }, + { + "epoch": 1.874061824195395, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4183988571166992, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8792688250541687, + "num_tokens": 562204955.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5790228843688965, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8731889128684998, + "num_tokens": 562239739.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6811482906341553, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8630116581916809, + "num_tokens": 562274536.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.584963083267212, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8749827146530151, + "num_tokens": 562310522.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6998716592788696, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.856940507888794, + "num_tokens": 562347436.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5983749628067017, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8871650695800781, + "num_tokens": 562385628.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5031951665878296, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8680840730667114, + "num_tokens": 562427410.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5896323919296265, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8645862340927124, + "num_tokens": 562469080.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6404043436050415, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8640148639678955, + "num_tokens": 562509344.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6346774101257324, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8713977336883545, + "num_tokens": 562543364.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6371203660964966, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.86768639087677, + "num_tokens": 562584635.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.723575234413147, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.877220869064331, + "num_tokens": 562618918.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4436616897583008, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8704627752304077, + "num_tokens": 562662400.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6005669832229614, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8851256370544434, + "num_tokens": 562699634.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6785888671875, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8661704063415527, + "num_tokens": 562733078.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5706918239593506, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8679642081260681, + "num_tokens": 562775031.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.642215609550476, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8754031658172607, + "num_tokens": 562811950.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.531219482421875, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8825904726982117, + "num_tokens": 562852938.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.593021273612976, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8867350816726685, + "num_tokens": 562890248.0, + "step": 14750 + }, + { + "epoch": 1.8764788194886148, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6184815168380737, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8653588891029358, + "num_tokens": 562929404.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7083007097244263, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8718388676643372, + "num_tokens": 562964735.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6009372472763062, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8801629543304443, + "num_tokens": 562998836.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.600301742553711, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8782064914703369, + "num_tokens": 563035947.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5357167720794678, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8815455436706543, + "num_tokens": 563075069.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.62354576587677, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8800469636917114, + "num_tokens": 563112237.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.534778118133545, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8712130784988403, + "num_tokens": 563153398.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6332324743270874, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8757157325744629, + "num_tokens": 563191363.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6292120218276978, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8677310347557068, + "num_tokens": 563230530.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5294759273529053, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8822810053825378, + "num_tokens": 563269020.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6116362810134888, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8793538808822632, + "num_tokens": 563308095.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.61528480052948, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8707326650619507, + "num_tokens": 563345184.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7032630443572998, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8653008341789246, + "num_tokens": 563379331.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7890392541885376, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8618344068527222, + "num_tokens": 563413825.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6640288829803467, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8791036605834961, + "num_tokens": 563452340.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.761252760887146, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8634123802185059, + "num_tokens": 563488386.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6767581701278687, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8670085668563843, + "num_tokens": 563526431.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6224391460418701, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8807392120361328, + "num_tokens": 563561490.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6095991134643555, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.888333797454834, + "num_tokens": 563599411.0, + "step": 14769 + }, + { + "epoch": 1.8788958147818344, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7220382690429688, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8666146397590637, + "num_tokens": 563642420.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6766027212142944, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8527953624725342, + "num_tokens": 563682895.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7542561292648315, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.85634845495224, + "num_tokens": 563716699.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8654216527938843, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8762994408607483, + "num_tokens": 563748304.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4854545593261719, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8705086708068848, + "num_tokens": 563791385.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5569027662277222, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8740918040275574, + "num_tokens": 563829816.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6225591897964478, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8644782304763794, + "num_tokens": 563865274.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5345653295516968, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8857811689376831, + "num_tokens": 563906087.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.639859914779663, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8676731586456299, + "num_tokens": 563943683.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5452426671981812, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.878020167350769, + "num_tokens": 563980794.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5304113626480103, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8757951259613037, + "num_tokens": 564017944.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.460292935371399, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8762922286987305, + "num_tokens": 564061023.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6525046825408936, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8779348731040955, + "num_tokens": 564099271.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6139827966690063, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8602585792541504, + "num_tokens": 564135556.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5669631958007812, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.863993227481842, + "num_tokens": 564174479.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5677233934402466, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8749889135360718, + "num_tokens": 564214338.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5472928285598755, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.883692741394043, + "num_tokens": 564250313.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7464892864227295, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8753397464752197, + "num_tokens": 564283590.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4702210426330566, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8714020252227783, + "num_tokens": 564327423.0, + "step": 14788 + }, + { + "epoch": 1.881312810075054, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7289506196975708, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8770511746406555, + "num_tokens": 564359295.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5593234300613403, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8580033779144287, + "num_tokens": 564400795.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7392700910568237, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8562500476837158, + "num_tokens": 564435194.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5609670877456665, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.879340648651123, + "num_tokens": 564472294.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6317241191864014, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8626371622085571, + "num_tokens": 564509991.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6915667057037354, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.866950511932373, + "num_tokens": 564546996.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7885278463363647, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8649455308914185, + "num_tokens": 564583019.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6366932392120361, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8722584247589111, + "num_tokens": 564620960.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5427861213684082, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8663968443870544, + "num_tokens": 564661029.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.541615605354309, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8658560514450073, + "num_tokens": 564704527.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7271395921707153, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8625049591064453, + "num_tokens": 564738792.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5300217866897583, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8797461986541748, + "num_tokens": 564777428.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.707003116607666, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8702859878540039, + "num_tokens": 564810740.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6678051948547363, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8807135224342346, + "num_tokens": 564846051.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.527180552482605, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.866127610206604, + "num_tokens": 564890312.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.602688193321228, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8583242297172546, + "num_tokens": 564930993.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5597515106201172, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8790743350982666, + "num_tokens": 564968628.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6886454820632935, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8741043210029602, + "num_tokens": 565006628.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5430470705032349, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8679361939430237, + "num_tokens": 565047903.0, + "step": 14807 + }, + { + "epoch": 1.8837298053682736, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5452721118927002, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8799514770507812, + "num_tokens": 565085799.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7315573692321777, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8554297685623169, + "num_tokens": 565122388.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6213613748550415, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8617455363273621, + "num_tokens": 565159718.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5214701890945435, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.875454306602478, + "num_tokens": 565196123.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5556851625442505, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8715526461601257, + "num_tokens": 565238171.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6272703409194946, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8832858204841614, + "num_tokens": 565276004.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5349308252334595, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8704181909561157, + "num_tokens": 565315400.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.752986192703247, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8625108003616333, + "num_tokens": 565350590.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5530184507369995, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8752148151397705, + "num_tokens": 565390708.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5592268705368042, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8716229200363159, + "num_tokens": 565425704.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5607887506484985, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8772556185722351, + "num_tokens": 565466038.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5933479070663452, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8704953789710999, + "num_tokens": 565505142.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.664794921875, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8692351579666138, + "num_tokens": 565542192.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.640439748764038, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8711801171302795, + "num_tokens": 565580245.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.622205138206482, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8782728910446167, + "num_tokens": 565616891.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6977903842926025, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8644803762435913, + "num_tokens": 565653168.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5884284973144531, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8900243639945984, + "num_tokens": 565685651.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7289915084838867, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.872854471206665, + "num_tokens": 565723171.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7705588340759277, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8585787415504456, + "num_tokens": 565761105.0, + "step": 14826 + }, + { + "epoch": 1.8861468006614934, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7036828994750977, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8704121112823486, + "num_tokens": 565794292.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6833927631378174, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8677107095718384, + "num_tokens": 565828324.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6230179071426392, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8664638996124268, + "num_tokens": 565869386.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6208573579788208, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8782249689102173, + "num_tokens": 565903825.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.666019082069397, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8722449541091919, + "num_tokens": 565939294.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4816083908081055, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8752433061599731, + "num_tokens": 565982813.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6570539474487305, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8619165420532227, + "num_tokens": 566020671.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5318186283111572, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.881563663482666, + "num_tokens": 566059426.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5740203857421875, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8580869436264038, + "num_tokens": 566098183.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.580571174621582, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.873528778553009, + "num_tokens": 566138338.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.539751410484314, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8677008152008057, + "num_tokens": 566176160.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.680413842201233, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8815401792526245, + "num_tokens": 566214610.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6992404460906982, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8577154874801636, + "num_tokens": 566248382.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5477666854858398, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8649571537971497, + "num_tokens": 566287548.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6943303346633911, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8547303080558777, + "num_tokens": 566325405.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7867037057876587, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8705031871795654, + "num_tokens": 566354592.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5836665630340576, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8584378957748413, + "num_tokens": 566403266.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6821008920669556, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8620843887329102, + "num_tokens": 566444075.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5716516971588135, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8745595216751099, + "num_tokens": 566485271.0, + "step": 14845 + }, + { + "epoch": 1.8885637959547132, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5838478803634644, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.872229278087616, + "num_tokens": 566524341.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6400364637374878, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8706566095352173, + "num_tokens": 566559750.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5045700073242188, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8619363903999329, + "num_tokens": 566603369.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6666128635406494, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8646821975708008, + "num_tokens": 566646464.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5782204866409302, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8772737979888916, + "num_tokens": 566684462.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6085453033447266, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8667566776275635, + "num_tokens": 566723136.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5748748779296875, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8713582754135132, + "num_tokens": 566764579.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6011853218078613, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8695434331893921, + "num_tokens": 566801319.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.757511019706726, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8618915677070618, + "num_tokens": 566836688.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6434779167175293, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8687763214111328, + "num_tokens": 566874408.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7162240743637085, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8536961674690247, + "num_tokens": 566908816.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5552107095718384, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8841556310653687, + "num_tokens": 566944996.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7302401065826416, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8634642958641052, + "num_tokens": 566986375.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5503013134002686, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8609304428100586, + "num_tokens": 567031290.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4889682531356812, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8761636018753052, + "num_tokens": 567070959.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6403752565383911, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8425570130348206, + "num_tokens": 567112409.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6997390985488892, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8573498129844666, + "num_tokens": 567147216.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7027509212493896, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8541275262832642, + "num_tokens": 567189746.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7564804553985596, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8718633651733398, + "num_tokens": 567221143.0, + "step": 14864 + }, + { + "epoch": 1.8909807912479328, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.834538221359253, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8747482299804688, + "num_tokens": 567253146.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5494500398635864, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8672469258308411, + "num_tokens": 567294302.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.696509838104248, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8682355284690857, + "num_tokens": 567328884.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.480596661567688, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8859457969665527, + "num_tokens": 567368672.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5787397623062134, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.869038462638855, + "num_tokens": 567404849.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6983588933944702, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8788824081420898, + "num_tokens": 567440640.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7273505926132202, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8620824813842773, + "num_tokens": 567474856.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5544756650924683, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.8877667188644409, + "num_tokens": 567512058.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5918338298797607, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8789443969726562, + "num_tokens": 567551356.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7335600852966309, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.859769880771637, + "num_tokens": 567585360.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6672160625457764, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.871320366859436, + "num_tokens": 567623396.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7222784757614136, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8665575385093689, + "num_tokens": 567658139.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.614458680152893, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8636972904205322, + "num_tokens": 567698692.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5841783285140991, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8720778226852417, + "num_tokens": 567734074.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6999105215072632, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8793125152587891, + "num_tokens": 567768960.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6084461212158203, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8619758486747742, + "num_tokens": 567806506.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5974398851394653, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8727144002914429, + "num_tokens": 567846079.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7004566192626953, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8570743799209595, + "num_tokens": 567878514.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.587921142578125, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8635832071304321, + "num_tokens": 567920821.0, + "step": 14883 + }, + { + "epoch": 1.8933977865411524, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6089600324630737, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8629260063171387, + "num_tokens": 567963939.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6381324529647827, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8672776222229004, + "num_tokens": 568004171.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7024548053741455, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8755267858505249, + "num_tokens": 568039594.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4950058460235596, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8844043016433716, + "num_tokens": 568077637.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5732386112213135, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8752338886260986, + "num_tokens": 568114203.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6894752979278564, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.875128984451294, + "num_tokens": 568147715.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.574445128440857, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8745941519737244, + "num_tokens": 568188670.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.595441460609436, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8791412115097046, + "num_tokens": 568224441.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7121946811676025, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8530526161193848, + "num_tokens": 568265194.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.640990972518921, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.86238694190979, + "num_tokens": 568304334.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5687955617904663, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8690679669380188, + "num_tokens": 568344312.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7566672563552856, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8644378185272217, + "num_tokens": 568379647.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6761212348937988, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.873022198677063, + "num_tokens": 568413996.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7754360437393188, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8853904008865356, + "num_tokens": 568444745.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5375338792800903, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8735078573226929, + "num_tokens": 568481982.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6991820335388184, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8550320267677307, + "num_tokens": 568521265.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5285558700561523, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8680551052093506, + "num_tokens": 568563015.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.728338360786438, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8687775135040283, + "num_tokens": 568596567.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6312599182128906, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8652676343917847, + "num_tokens": 568637674.0, + "step": 14902 + }, + { + "epoch": 1.8958147818343722, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5598410367965698, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.863187849521637, + "num_tokens": 568680161.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4855490922927856, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8806579113006592, + "num_tokens": 568722056.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5365712642669678, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.882200300693512, + "num_tokens": 568761068.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5684127807617188, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8524333238601685, + "num_tokens": 568804430.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.544996738433838, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.877181351184845, + "num_tokens": 568842032.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5561290979385376, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8860963582992554, + "num_tokens": 568878037.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6459949016571045, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.854576051235199, + "num_tokens": 568915085.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.426119327545166, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8760637044906616, + "num_tokens": 568960393.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8658666610717773, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8575981855392456, + "num_tokens": 568993790.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6389708518981934, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8721939325332642, + "num_tokens": 569030800.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6123663187026978, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8771722316741943, + "num_tokens": 569068666.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6890743970870972, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8662406206130981, + "num_tokens": 569104625.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4552406072616577, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8816129565238953, + "num_tokens": 569147112.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6059067249298096, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8584064245223999, + "num_tokens": 569184227.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.481860637664795, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8770187497138977, + "num_tokens": 569227208.0, + "step": 14917 + }, + { + "epoch": 1.89772293601323, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6500946283340454, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.881172776222229, + "num_tokens": 569261147.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6047803163528442, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8675243258476257, + "num_tokens": 569300477.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6474695205688477, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8830066919326782, + "num_tokens": 569336338.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5222389698028564, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8914719223976135, + "num_tokens": 569376741.0, + "step": 14921 + }, + { + "epoch": 1.898231777127592, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.538366675376892, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8670510053634644, + "num_tokens": 569417946.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6655232906341553, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8763328790664673, + "num_tokens": 569453855.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.550362467765808, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8628312349319458, + "num_tokens": 569496142.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5640507936477661, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8787893056869507, + "num_tokens": 569532604.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6408461332321167, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8722208738327026, + "num_tokens": 569567418.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5599130392074585, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8669657111167908, + "num_tokens": 569605559.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8435404300689697, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8812440037727356, + "num_tokens": 569637107.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5638643503189087, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8683823347091675, + "num_tokens": 569675641.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6372289657592773, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.870881199836731, + "num_tokens": 569713282.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.53878653049469, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8748797178268433, + "num_tokens": 569750043.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5617438554763794, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.866914689540863, + "num_tokens": 569793434.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5672322511672974, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8738936185836792, + "num_tokens": 569833241.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6101181507110596, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.876945972442627, + "num_tokens": 569868853.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7708485126495361, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8650743961334229, + "num_tokens": 569905574.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.578683614730835, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.8927531242370605, + "num_tokens": 569941008.0, + "step": 14936 + }, + { + "epoch": 1.9001399313064495, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7568082809448242, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8653761148452759, + "num_tokens": 569973112.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5116207599639893, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8713380098342896, + "num_tokens": 570011929.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5646181106567383, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8609668612480164, + "num_tokens": 570053501.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6311577558517456, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8628150224685669, + "num_tokens": 570090504.0, + "step": 14940 + }, + { + "epoch": 1.9006487724208116, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5626455545425415, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8756142258644104, + "num_tokens": 570127261.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.496534824371338, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8826366066932678, + "num_tokens": 570163254.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5701979398727417, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8784643411636353, + "num_tokens": 570201961.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5166478157043457, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8695902824401855, + "num_tokens": 570242826.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7442049980163574, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8713056445121765, + "num_tokens": 570275796.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6243351697921753, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8820634484291077, + "num_tokens": 570313582.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7257441282272339, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8756862878799438, + "num_tokens": 570351835.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5753529071807861, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8628664016723633, + "num_tokens": 570391722.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5197038650512695, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8693667054176331, + "num_tokens": 570431385.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6863007545471191, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8776237964630127, + "num_tokens": 570466607.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7044146060943604, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8791673183441162, + "num_tokens": 570501227.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6013838052749634, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8779340982437134, + "num_tokens": 570542363.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.618223786354065, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8850691318511963, + "num_tokens": 570582043.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5379194021224976, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8706161975860596, + "num_tokens": 570622987.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6551241874694824, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8663263320922852, + "num_tokens": 570657920.0, + "step": 14955 + }, + { + "epoch": 1.9025569265996691, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5407096147537231, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8884599208831787, + "num_tokens": 570692422.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5393503904342651, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8780300617218018, + "num_tokens": 570734313.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5871315002441406, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.885303258895874, + "num_tokens": 570771542.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6043658256530762, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8658156394958496, + "num_tokens": 570808500.0, + "step": 14959 + }, + { + "epoch": 1.9030657677140312, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5530251264572144, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8889380097389221, + "num_tokens": 570847564.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.608814001083374, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8717550039291382, + "num_tokens": 570885270.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5437954664230347, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8823015093803406, + "num_tokens": 570923049.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6720647811889648, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8668668866157532, + "num_tokens": 570957953.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6560301780700684, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8711174130439758, + "num_tokens": 570993254.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7659307718276978, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8704935312271118, + "num_tokens": 571030933.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6940176486968994, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8730378746986389, + "num_tokens": 571066046.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6810969114303589, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8609065413475037, + "num_tokens": 571105323.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6976321935653687, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8755416870117188, + "num_tokens": 571141170.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5991144180297852, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8622905015945435, + "num_tokens": 571185547.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.719098687171936, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8699907660484314, + "num_tokens": 571219160.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6263045072555542, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.870692253112793, + "num_tokens": 571255122.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.890434741973877, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.873328447341919, + "num_tokens": 571287010.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6080353260040283, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8679707050323486, + "num_tokens": 571324425.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5654816627502441, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8693612813949585, + "num_tokens": 571364893.0, + "step": 14974 + }, + { + "epoch": 1.904973921892889, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.423700213432312, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8899853825569153, + "num_tokens": 571409472.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7567514181137085, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8571885824203491, + "num_tokens": 571444859.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.638617992401123, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8475203514099121, + "num_tokens": 571486533.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.51521897315979, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8670294284820557, + "num_tokens": 571527937.0, + "step": 14978 + }, + { + "epoch": 1.9054827630072508, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5290966033935547, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.877102792263031, + "num_tokens": 571568727.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6248087882995605, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.870530366897583, + "num_tokens": 571609176.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6036410331726074, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8743594884872437, + "num_tokens": 571648713.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.683899998664856, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8724591732025146, + "num_tokens": 571683530.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.581642508506775, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8746688961982727, + "num_tokens": 571724155.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.675792932510376, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8584064245223999, + "num_tokens": 571762719.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6353669166564941, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8671201467514038, + "num_tokens": 571802977.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5915100574493408, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8802794814109802, + "num_tokens": 571839154.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.563867449760437, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8688780069351196, + "num_tokens": 571881553.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6357332468032837, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8758350610733032, + "num_tokens": 571915230.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5817371606826782, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8664753437042236, + "num_tokens": 571954073.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6021143198013306, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8764971494674683, + "num_tokens": 571993110.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.650192141532898, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8647565245628357, + "num_tokens": 572028757.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6287480592727661, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8657994866371155, + "num_tokens": 572067105.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.573813557624817, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8745479583740234, + "num_tokens": 572103593.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.638909935951233, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8739979863166809, + "num_tokens": 572136142.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5619105100631714, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8772104382514954, + "num_tokens": 572174358.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6908069849014282, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8684258460998535, + "num_tokens": 572211944.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6766343116760254, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8603341579437256, + "num_tokens": 572244931.0, + "step": 14997 + }, + { + "epoch": 1.9078997583004706, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7219823598861694, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8799710869789124, + "num_tokens": 572277626.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7915058135986328, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8466225862503052, + "num_tokens": 572314408.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5638349056243896, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8720470666885376, + "num_tokens": 572354870.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.842037320137024, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8735975027084351, + "num_tokens": 572386762.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5438231229782104, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8753895163536072, + "num_tokens": 572425879.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5333683490753174, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.873492956161499, + "num_tokens": 572466454.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5621769428253174, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8587801456451416, + "num_tokens": 572511514.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.655511736869812, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8840048313140869, + "num_tokens": 572543021.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.606946587562561, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8688604831695557, + "num_tokens": 572579611.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.609908938407898, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8630073070526123, + "num_tokens": 572616966.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4893646240234375, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8743102550506592, + "num_tokens": 572659990.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5707238912582397, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8689429759979248, + "num_tokens": 572699404.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6548982858657837, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8621243238449097, + "num_tokens": 572735432.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5309559106826782, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8759018778800964, + "num_tokens": 572773063.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.508433222770691, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8668834567070007, + "num_tokens": 572818182.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6903858184814453, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8707860708236694, + "num_tokens": 572853717.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6985430717468262, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8497306108474731, + "num_tokens": 572894076.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4682626724243164, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8790946006774902, + "num_tokens": 572937433.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7119957208633423, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8749012351036072, + "num_tokens": 572969990.0, + "step": 15016 + }, + { + "epoch": 1.9103167535936905, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5824612379074097, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8653151988983154, + "num_tokens": 573007402.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5287821292877197, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8894298076629639, + "num_tokens": 573044883.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5568161010742188, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.866365909576416, + "num_tokens": 573084550.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4432928562164307, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8846539258956909, + "num_tokens": 573124593.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4899582862854004, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.873457670211792, + "num_tokens": 573171732.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7166297435760498, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8608535528182983, + "num_tokens": 573209973.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7983132600784302, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8607940077781677, + "num_tokens": 573245669.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6305025815963745, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8659154176712036, + "num_tokens": 573284738.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5984646081924438, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8756262063980103, + "num_tokens": 573324282.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6081804037094116, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.874508261680603, + "num_tokens": 573363739.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6818588972091675, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.862730085849762, + "num_tokens": 573398998.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.711334228515625, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8746562004089355, + "num_tokens": 573433816.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6589093208312988, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8762697577476501, + "num_tokens": 573468382.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7960690259933472, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8675212860107422, + "num_tokens": 573501129.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4795193672180176, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8887634873390198, + "num_tokens": 573542393.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.507933259010315, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8722270727157593, + "num_tokens": 573580687.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4453785419464111, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.886494517326355, + "num_tokens": 573624671.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6553740501403809, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8522542715072632, + "num_tokens": 573660170.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.495247721672058, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8574398756027222, + "num_tokens": 573705686.0, + "step": 15035 + }, + { + "epoch": 1.91273374888691, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6198219060897827, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8698084354400635, + "num_tokens": 573743504.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5666424036026, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8716790676116943, + "num_tokens": 573781133.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7343804836273193, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8806759119033813, + "num_tokens": 573819548.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5718300342559814, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.862750232219696, + "num_tokens": 573862573.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5972267389297485, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8618481159210205, + "num_tokens": 573901782.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6493849754333496, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8788556456565857, + "num_tokens": 573941534.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5831551551818848, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8676691651344299, + "num_tokens": 573979985.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4858920574188232, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8680497407913208, + "num_tokens": 574021123.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.212658405303955, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8771014213562012, + "num_tokens": 574061654.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5430537462234497, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8576295375823975, + "num_tokens": 574103776.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5552690029144287, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8833140730857849, + "num_tokens": 574140374.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5512568950653076, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8777498006820679, + "num_tokens": 574178649.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.879173755645752, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8706325888633728, + "num_tokens": 574216521.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5905587673187256, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8720311522483826, + "num_tokens": 574253402.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.449532389640808, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8772851824760437, + "num_tokens": 574298219.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5235588550567627, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8715798854827881, + "num_tokens": 574336924.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.54385244846344, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8589435815811157, + "num_tokens": 574377653.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5113023519515991, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8786804676055908, + "num_tokens": 574417660.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5320839881896973, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8724993467330933, + "num_tokens": 574457415.0, + "step": 15054 + }, + { + "epoch": 1.9151507441801296, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6204618215560913, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8735868334770203, + "num_tokens": 574494139.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5965309143066406, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8722770810127258, + "num_tokens": 574531357.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4750186204910278, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8644200563430786, + "num_tokens": 574577653.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6670928001403809, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8683971166610718, + "num_tokens": 574616870.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5270042419433594, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8607814908027649, + "num_tokens": 574660468.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.455462098121643, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8742415904998779, + "num_tokens": 574704495.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6838405132293701, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8805704116821289, + "num_tokens": 574743306.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.673660397529602, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8780264854431152, + "num_tokens": 574781086.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.220956325531006, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.868328869342804, + "num_tokens": 574818401.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6262246370315552, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8707805871963501, + "num_tokens": 574854185.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5752733945846558, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8774917721748352, + "num_tokens": 574894461.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 3.76875901222229, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8768340945243835, + "num_tokens": 574927918.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7750115394592285, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8716031312942505, + "num_tokens": 574965586.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.759507417678833, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8560783267021179, + "num_tokens": 574997929.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6684781312942505, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8693757057189941, + "num_tokens": 575032761.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.693293571472168, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8653924465179443, + "num_tokens": 575070183.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6473690271377563, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8823837041854858, + "num_tokens": 575108347.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6534839868545532, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8849073052406311, + "num_tokens": 575142244.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7934316396713257, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8637660145759583, + "num_tokens": 575173499.0, + "step": 15073 + }, + { + "epoch": 1.9175677394733495, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6513831615447998, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8702938556671143, + "num_tokens": 575209771.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4216196537017822, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8768190741539001, + "num_tokens": 575255702.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6171822547912598, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8769516348838806, + "num_tokens": 575290687.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.691007375717163, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8800625801086426, + "num_tokens": 575322985.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6719231605529785, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8758856058120728, + "num_tokens": 575361174.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5962520837783813, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8700414896011353, + "num_tokens": 575399727.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5713918209075928, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8677483797073364, + "num_tokens": 575438261.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.512345552444458, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8633394837379456, + "num_tokens": 575480369.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6712383031845093, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8656660318374634, + "num_tokens": 575522285.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5779502391815186, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.864901602268219, + "num_tokens": 575564444.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6897839307785034, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8748602271080017, + "num_tokens": 575600564.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.557691216468811, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8651605248451233, + "num_tokens": 575642367.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5561999082565308, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.882077693939209, + "num_tokens": 575677869.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.70058274269104, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8700350522994995, + "num_tokens": 575712262.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6916582584381104, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8862553834915161, + "num_tokens": 575748301.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5868340730667114, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8700359463691711, + "num_tokens": 575785670.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.629298210144043, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8743836283683777, + "num_tokens": 575820041.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4715747833251953, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8781909346580505, + "num_tokens": 575863034.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6075283288955688, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8809384107589722, + "num_tokens": 575897585.0, + "step": 15092 + }, + { + "epoch": 1.9199847347665693, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5758129358291626, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.871321439743042, + "num_tokens": 575935430.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7370513677597046, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8755823969841003, + "num_tokens": 575968053.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.643657922744751, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8723330497741699, + "num_tokens": 576003170.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6504014730453491, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8637484908103943, + "num_tokens": 576040373.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4356142282485962, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.878247857093811, + "num_tokens": 576086480.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.695534110069275, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8678156137466431, + "num_tokens": 576125538.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7312504053115845, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8539037704467773, + "num_tokens": 576164067.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6839306354522705, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8570998311042786, + "num_tokens": 576203072.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6717063188552856, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8576785326004028, + "num_tokens": 576240001.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6200894117355347, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8797692060470581, + "num_tokens": 576278871.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.648559331893921, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8718990087509155, + "num_tokens": 576315105.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4722349643707275, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8789650201797485, + "num_tokens": 576356718.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.599481463432312, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8735106587409973, + "num_tokens": 576392959.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7423782348632812, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8698899149894714, + "num_tokens": 576425859.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.570847511291504, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.873305082321167, + "num_tokens": 576465068.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6814420223236084, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8608540296554565, + "num_tokens": 576502335.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.9127851724624634, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8647589683532715, + "num_tokens": 576532091.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6433097124099731, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8740005493164062, + "num_tokens": 576567065.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7874746322631836, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8767566680908203, + "num_tokens": 576598544.0, + "step": 15111 + }, + { + "epoch": 1.9224017300597889, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5534849166870117, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8828462362289429, + "num_tokens": 576638032.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6326282024383545, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8685005307197571, + "num_tokens": 576676135.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.514583945274353, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8657814264297485, + "num_tokens": 576719395.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5805152654647827, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8613629937171936, + "num_tokens": 576761085.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5906497240066528, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8778049945831299, + "num_tokens": 576799277.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7239381074905396, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8664839267730713, + "num_tokens": 576832118.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7519605159759521, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.874058187007904, + "num_tokens": 576865602.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.741463303565979, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8760462403297424, + "num_tokens": 576900527.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.63286554813385, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8805460333824158, + "num_tokens": 576938225.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7298023700714111, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8741985559463501, + "num_tokens": 576970123.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.760775089263916, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8573685884475708, + "num_tokens": 577002918.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6027582883834839, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.873124897480011, + "num_tokens": 577040899.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.652251958847046, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8688647747039795, + "num_tokens": 577079458.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7757577896118164, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8772563934326172, + "num_tokens": 577110821.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6533523797988892, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8617112636566162, + "num_tokens": 577151608.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6276631355285645, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8589702844619751, + "num_tokens": 577192438.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5435365438461304, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8717145919799805, + "num_tokens": 577233399.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6655875444412231, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8751912713050842, + "num_tokens": 577270769.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.77730131149292, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8772951364517212, + "num_tokens": 577302283.0, + "step": 15130 + }, + { + "epoch": 1.9248187253530085, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6654837131500244, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8629777431488037, + "num_tokens": 577336459.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6049822568893433, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8550772666931152, + "num_tokens": 577376224.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6043503284454346, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8642135262489319, + "num_tokens": 577413878.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5965008735656738, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8809075951576233, + "num_tokens": 577450901.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.664972186088562, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8519931435585022, + "num_tokens": 577489616.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5893573760986328, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8705200552940369, + "num_tokens": 577528120.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6409457921981812, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8774956464767456, + "num_tokens": 577562691.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4897608757019043, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8783916234970093, + "num_tokens": 577600480.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6242430210113525, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8840997815132141, + "num_tokens": 577633034.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6595773696899414, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8467146158218384, + "num_tokens": 577671627.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6553497314453125, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8719786405563354, + "num_tokens": 577704911.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5267764329910278, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8885195255279541, + "num_tokens": 577738732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5967460870742798, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8761581182479858, + "num_tokens": 577776147.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5083668231964111, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8824012279510498, + "num_tokens": 577817290.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7591278553009033, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8668522834777832, + "num_tokens": 577847739.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5758955478668213, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8795790076255798, + "num_tokens": 577885656.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5911080837249756, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8750324845314026, + "num_tokens": 577929376.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5744972229003906, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8562845587730408, + "num_tokens": 577969112.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5047776699066162, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8646469712257385, + "num_tokens": 578012258.0, + "step": 15149 + }, + { + "epoch": 1.927235720646228, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5820579528808594, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8700059652328491, + "num_tokens": 578051628.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.525508999824524, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8551947474479675, + "num_tokens": 578092980.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5984359979629517, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8753957748413086, + "num_tokens": 578133476.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5696316957473755, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8668054938316345, + "num_tokens": 578175888.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6376628875732422, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8658708930015564, + "num_tokens": 578211911.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.738136649131775, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8769294023513794, + "num_tokens": 578244192.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5521570444107056, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8530554175376892, + "num_tokens": 578289384.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6330305337905884, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8602721095085144, + "num_tokens": 578330129.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7889949083328247, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8689877986907959, + "num_tokens": 578363296.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5821774005889893, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8817316293716431, + "num_tokens": 578404573.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6527451276779175, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8872039914131165, + "num_tokens": 578436177.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.474948525428772, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.88132643699646, + "num_tokens": 578479468.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.438213586807251, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8835736513137817, + "num_tokens": 578524039.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4958086013793945, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.87647545337677, + "num_tokens": 578565318.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7060166597366333, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8747164011001587, + "num_tokens": 578598888.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.702041745185852, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8638119101524353, + "num_tokens": 578639581.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7150518894195557, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8605223894119263, + "num_tokens": 578675041.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.656206727027893, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8777461051940918, + "num_tokens": 578716759.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.492283821105957, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8778020143508911, + "num_tokens": 578758765.0, + "step": 15168 + }, + { + "epoch": 1.9296527159394479, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.707047700881958, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8536734580993652, + "num_tokens": 578795717.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6000772714614868, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8759602308273315, + "num_tokens": 578834223.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5278080701828003, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8771600127220154, + "num_tokens": 578871664.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.492050290107727, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8646619319915771, + "num_tokens": 578915191.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.738682746887207, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8641996383666992, + "num_tokens": 578952516.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.733637809753418, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8670893907546997, + "num_tokens": 578989280.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6124407052993774, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8641753196716309, + "num_tokens": 579027177.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.596679925918579, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8592827320098877, + "num_tokens": 579064483.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5755120515823364, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8816497325897217, + "num_tokens": 579102755.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.774060845375061, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8800264596939087, + "num_tokens": 579133909.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.565929651260376, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8689984083175659, + "num_tokens": 579171532.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.655316948890686, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8670474886894226, + "num_tokens": 579208461.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.693908929824829, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8624305725097656, + "num_tokens": 579244841.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6102714538574219, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8712270855903625, + "num_tokens": 579282255.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8800328969955444, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8672285079956055, + "num_tokens": 579313123.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.536352515220642, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8847939968109131, + "num_tokens": 579350161.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5936373472213745, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8715484142303467, + "num_tokens": 579389310.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5414777994155884, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8704643845558167, + "num_tokens": 579429334.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.580195426940918, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8738318085670471, + "num_tokens": 579469934.0, + "step": 15187 + }, + { + "epoch": 1.9320697112326677, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.632516622543335, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8570969700813293, + "num_tokens": 579511258.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4726226329803467, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8818222284317017, + "num_tokens": 579551879.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6330751180648804, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8733423352241516, + "num_tokens": 579587315.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6242787837982178, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8639219999313354, + "num_tokens": 579624712.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5775598287582397, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.876271665096283, + "num_tokens": 579665183.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7513378858566284, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8650730848312378, + "num_tokens": 579699334.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6341397762298584, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8805752396583557, + "num_tokens": 579735463.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4988290071487427, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8747463226318359, + "num_tokens": 579778243.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7056173086166382, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.876632571220398, + "num_tokens": 579810007.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.602612853050232, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8758898377418518, + "num_tokens": 579847276.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5035678148269653, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.869314432144165, + "num_tokens": 579885454.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7374424934387207, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8671270608901978, + "num_tokens": 579919292.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5366194248199463, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8690390586853027, + "num_tokens": 579958889.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5798265933990479, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8826218247413635, + "num_tokens": 579992689.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5720986127853394, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8629865646362305, + "num_tokens": 580033087.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7158634662628174, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8510735630989075, + "num_tokens": 580067535.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6060247421264648, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.863018274307251, + "num_tokens": 580107219.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6066817045211792, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8765615820884705, + "num_tokens": 580143838.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.0258967876434326, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8486555218696594, + "num_tokens": 580173194.0, + "step": 15206 + }, + { + "epoch": 1.9344867065258873, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7300869226455688, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.871743381023407, + "num_tokens": 580207116.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.643944501876831, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8687425851821899, + "num_tokens": 580241588.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5204683542251587, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8815522193908691, + "num_tokens": 580279469.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.612520456314087, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8656159043312073, + "num_tokens": 580317645.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.640200138092041, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.873684823513031, + "num_tokens": 580356558.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6092545986175537, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8562067747116089, + "num_tokens": 580393782.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.729673981666565, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8776130080223083, + "num_tokens": 580427758.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5623836517333984, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8682854175567627, + "num_tokens": 580468791.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5804682970046997, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8863611221313477, + "num_tokens": 580502620.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5283740758895874, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8818939328193665, + "num_tokens": 580540548.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6448296308517456, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8686279058456421, + "num_tokens": 580577191.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6586841344833374, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8538901805877686, + "num_tokens": 580614598.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6238007545471191, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8649753928184509, + "num_tokens": 580653438.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5796042680740356, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8635340929031372, + "num_tokens": 580691534.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6130070686340332, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8741670250892639, + "num_tokens": 580725769.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4720720052719116, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8791848421096802, + "num_tokens": 580768621.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5113513469696045, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8879251480102539, + "num_tokens": 580806132.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7857075929641724, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8564176559448242, + "num_tokens": 580841456.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5239942073822021, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8829450607299805, + "num_tokens": 580880387.0, + "step": 15225 + }, + { + "epoch": 1.9369037018191069, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6555564403533936, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.862785816192627, + "num_tokens": 580924104.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6314759254455566, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8655843734741211, + "num_tokens": 580960634.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6011738777160645, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8774442672729492, + "num_tokens": 580999581.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.491790533065796, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.882370114326477, + "num_tokens": 581042122.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7514324188232422, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8713024854660034, + "num_tokens": 581074395.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.747073769569397, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8606040477752686, + "num_tokens": 581114072.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7484416961669922, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8727549314498901, + "num_tokens": 581151479.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7311736345291138, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8842803239822388, + "num_tokens": 581189805.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6047704219818115, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.870319664478302, + "num_tokens": 581226498.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5482442378997803, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8782160878181458, + "num_tokens": 581263652.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5800378322601318, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8617517948150635, + "num_tokens": 581301255.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8098301887512207, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8739814758300781, + "num_tokens": 581335634.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7298463582992554, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8555246591567993, + "num_tokens": 581378518.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7593642473220825, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8638492822647095, + "num_tokens": 581418196.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 3.7422683238983154, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.862265408039093, + "num_tokens": 581454536.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.721662163734436, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8690940141677856, + "num_tokens": 581488066.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6515790224075317, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8657329082489014, + "num_tokens": 581526890.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6211555004119873, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8664212822914124, + "num_tokens": 581568023.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6111335754394531, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8716234564781189, + "num_tokens": 581604416.0, + "step": 15244 + }, + { + "epoch": 1.9393206971123267, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5596835613250732, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8777806758880615, + "num_tokens": 581644108.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5767408609390259, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8839157223701477, + "num_tokens": 581681000.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5665347576141357, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.866206169128418, + "num_tokens": 581720731.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.54453444480896, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8714661002159119, + "num_tokens": 581759507.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6491838693618774, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8735511898994446, + "num_tokens": 581797310.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7882835865020752, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8551668524742126, + "num_tokens": 581831321.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6407701969146729, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8829739093780518, + "num_tokens": 581864598.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.63959801197052, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8666534423828125, + "num_tokens": 581903059.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5857620239257812, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8673713207244873, + "num_tokens": 581942173.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7676883935928345, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8646599054336548, + "num_tokens": 581979008.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5822187662124634, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8726997375488281, + "num_tokens": 582018650.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6485595703125, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8818427324295044, + "num_tokens": 582050014.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5109673738479614, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8825491070747375, + "num_tokens": 582087806.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.509634017944336, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.872962236404419, + "num_tokens": 582133053.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6370658874511719, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8740127086639404, + "num_tokens": 582166551.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7425347566604614, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8772560954093933, + "num_tokens": 582202163.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7662190198898315, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8637018203735352, + "num_tokens": 582238558.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.612233281135559, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8750494718551636, + "num_tokens": 582275960.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.2547543048858643, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8598552942276001, + "num_tokens": 582316539.0, + "step": 15263 + }, + { + "epoch": 1.9417376924055465, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.687666654586792, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8561999797821045, + "num_tokens": 582351665.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5487606525421143, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8693403005599976, + "num_tokens": 582392861.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4582916498184204, + "learning_rate": 1e-06, + "loss": 0.3309, + "mean_token_accuracy": 0.8913543224334717, + "num_tokens": 582436491.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6652323007583618, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8765753507614136, + "num_tokens": 582474395.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5606220960617065, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8510922193527222, + "num_tokens": 582517537.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5705372095108032, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.879356861114502, + "num_tokens": 582557437.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.909830093383789, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8558991551399231, + "num_tokens": 582594324.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6940851211547852, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8738670945167542, + "num_tokens": 582632503.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.619336724281311, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8591476678848267, + "num_tokens": 582670312.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6340749263763428, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8665671348571777, + "num_tokens": 582707502.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5470385551452637, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8765389919281006, + "num_tokens": 582744575.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5419310331344604, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8780257701873779, + "num_tokens": 582784664.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.672788381576538, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8628857135772705, + "num_tokens": 582819394.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.867809534072876, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8807653188705444, + "num_tokens": 582851303.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7303812503814697, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8568743467330933, + "num_tokens": 582888216.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6310298442840576, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8611015677452087, + "num_tokens": 582928898.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5108789205551147, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8832188844680786, + "num_tokens": 582970507.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5660130977630615, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8817586302757263, + "num_tokens": 583009842.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8106651306152344, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8775792121887207, + "num_tokens": 583044967.0, + "step": 15282 + }, + { + "epoch": 1.944154687698766, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4481428861618042, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8876486420631409, + "num_tokens": 583088786.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4907703399658203, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8839834332466125, + "num_tokens": 583127290.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6330757141113281, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8731852173805237, + "num_tokens": 583159766.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6528087854385376, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8514895439147949, + "num_tokens": 583194112.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.617879033088684, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8610119819641113, + "num_tokens": 583234035.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5559686422348022, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8777451515197754, + "num_tokens": 583274999.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6162800788879395, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8695542216300964, + "num_tokens": 583313571.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5262829065322876, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8782627582550049, + "num_tokens": 583352039.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4979861974716187, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8622976541519165, + "num_tokens": 583395528.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5913447141647339, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8746781349182129, + "num_tokens": 583431099.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5832600593566895, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8792746663093567, + "num_tokens": 583467212.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.578248143196106, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8790332078933716, + "num_tokens": 583505423.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.843822717666626, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8559635877609253, + "num_tokens": 583540043.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6489183902740479, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8685562610626221, + "num_tokens": 583579511.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6163592338562012, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8750370740890503, + "num_tokens": 583615346.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6098400354385376, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8814722299575806, + "num_tokens": 583651336.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6721148490905762, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8716350793838501, + "num_tokens": 583690172.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5034985542297363, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8629330396652222, + "num_tokens": 583734643.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8251515626907349, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8473867177963257, + "num_tokens": 583766303.0, + "step": 15301 + }, + { + "epoch": 1.9465716829919857, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5065668821334839, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8727471232414246, + "num_tokens": 583805558.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6832524538040161, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8522347211837769, + "num_tokens": 583846552.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4981071949005127, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8698333501815796, + "num_tokens": 583885129.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6670464277267456, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8569930791854858, + "num_tokens": 583922521.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5559582710266113, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8854644894599915, + "num_tokens": 583958014.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.581475853919983, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.885408341884613, + "num_tokens": 583993051.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4893314838409424, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8610082864761353, + "num_tokens": 584039524.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5386528968811035, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8817933797836304, + "num_tokens": 584078467.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5331467390060425, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.863805890083313, + "num_tokens": 584119451.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6022534370422363, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8718613386154175, + "num_tokens": 584157128.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6219335794448853, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8759312033653259, + "num_tokens": 584191113.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.563913345336914, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8694618344306946, + "num_tokens": 584230253.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7029534578323364, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8778345584869385, + "num_tokens": 584263470.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.761744499206543, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8904067277908325, + "num_tokens": 584295674.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5108414888381958, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8663618564605713, + "num_tokens": 584337276.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5185853242874146, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8762972950935364, + "num_tokens": 584378255.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6757968664169312, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8567672967910767, + "num_tokens": 584414885.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7437390089035034, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8700309991836548, + "num_tokens": 584446195.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.507088541984558, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8915318846702576, + "num_tokens": 584485330.0, + "step": 15320 + }, + { + "epoch": 1.9489886782852053, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5854378938674927, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8600573539733887, + "num_tokens": 584524959.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6521543264389038, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8691332340240479, + "num_tokens": 584559426.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5237635374069214, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8751475214958191, + "num_tokens": 584598649.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6679316759109497, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.871752917766571, + "num_tokens": 584632428.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5670514106750488, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8637640476226807, + "num_tokens": 584674144.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7663475275039673, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8634053468704224, + "num_tokens": 584709651.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6367982625961304, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8653824329376221, + "num_tokens": 584746915.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6113804578781128, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8726800084114075, + "num_tokens": 584786316.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6650912761688232, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8741931319236755, + "num_tokens": 584822092.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5606191158294678, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.868964433670044, + "num_tokens": 584861241.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6244205236434937, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.889298677444458, + "num_tokens": 584895452.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6998802423477173, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8589699268341064, + "num_tokens": 584932598.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7283918857574463, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8818126320838928, + "num_tokens": 584965465.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5214744806289673, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8759628534317017, + "num_tokens": 585006649.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.421482801437378, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8760939240455627, + "num_tokens": 585049534.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5127805471420288, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8692503571510315, + "num_tokens": 585089756.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5046337842941284, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8907464742660522, + "num_tokens": 585127225.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7590696811676025, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.859602153301239, + "num_tokens": 585161005.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7655946016311646, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8678703308105469, + "num_tokens": 585196830.0, + "step": 15339 + }, + { + "epoch": 1.951405673578425, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.574454665184021, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8663488626480103, + "num_tokens": 585236996.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5084868669509888, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8749157786369324, + "num_tokens": 585279991.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5351144075393677, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8825128078460693, + "num_tokens": 585320128.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6616960763931274, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8793332576751709, + "num_tokens": 585354691.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4915028810501099, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.886151134967804, + "num_tokens": 585396904.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5775878429412842, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8856828212738037, + "num_tokens": 585435054.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.609705924987793, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8787974119186401, + "num_tokens": 585471238.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8403313159942627, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8461885452270508, + "num_tokens": 585503410.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5193196535110474, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8863134980201721, + "num_tokens": 585543288.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7500258684158325, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8747598528862, + "num_tokens": 585573675.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5342152118682861, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8652124404907227, + "num_tokens": 585620439.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.494591474533081, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8764017224311829, + "num_tokens": 585663123.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5530577898025513, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8775743842124939, + "num_tokens": 585702986.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6893043518066406, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8526989221572876, + "num_tokens": 585739665.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5389994382858276, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8753532767295837, + "num_tokens": 585779558.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5518925189971924, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8706597089767456, + "num_tokens": 585819330.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4117038249969482, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.883275032043457, + "num_tokens": 585866076.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5066365003585815, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8659750819206238, + "num_tokens": 585906199.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4916037321090698, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8726402521133423, + "num_tokens": 585948523.0, + "step": 15358 + }, + { + "epoch": 1.953822668871645, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6339678764343262, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8769282102584839, + "num_tokens": 585984269.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5564625263214111, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8738682866096497, + "num_tokens": 586025625.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7388999462127686, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8834549188613892, + "num_tokens": 586054808.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7264231443405151, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8695018291473389, + "num_tokens": 586091808.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5769133567810059, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8888968825340271, + "num_tokens": 586128573.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6317548751831055, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.878690242767334, + "num_tokens": 586162031.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.724927306175232, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8738631010055542, + "num_tokens": 586195511.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5608925819396973, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8576405048370361, + "num_tokens": 586237825.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5480399131774902, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8822633028030396, + "num_tokens": 586276537.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.691657304763794, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8736438751220703, + "num_tokens": 586318420.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6271872520446777, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8808020353317261, + "num_tokens": 586355175.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.665080189704895, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8684832453727722, + "num_tokens": 586392990.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6476359367370605, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8747906684875488, + "num_tokens": 586431417.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.684990406036377, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8775072693824768, + "num_tokens": 586468949.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5880062580108643, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8959265947341919, + "num_tokens": 586501068.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8056107759475708, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8689298629760742, + "num_tokens": 586533456.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5747321844100952, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8723431825637817, + "num_tokens": 586568932.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5697691440582275, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8731251358985901, + "num_tokens": 586608168.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6785025596618652, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8498849868774414, + "num_tokens": 586646568.0, + "step": 15377 + }, + { + "epoch": 1.9562396641648645, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7174196243286133, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8695550560951233, + "num_tokens": 586678002.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6782939434051514, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8741402626037598, + "num_tokens": 586710869.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6516677141189575, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8743684887886047, + "num_tokens": 586745729.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5852839946746826, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8897151947021484, + "num_tokens": 586779450.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6905443668365479, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8703517913818359, + "num_tokens": 586816366.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5347442626953125, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8697737455368042, + "num_tokens": 586857287.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6123669147491455, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8770545721054077, + "num_tokens": 586891646.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7469009160995483, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8599035739898682, + "num_tokens": 586926925.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5764451026916504, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8769099116325378, + "num_tokens": 586963130.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5586416721343994, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8568379282951355, + "num_tokens": 587001697.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5885424613952637, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8560557961463928, + "num_tokens": 587044581.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4985705614089966, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8702164888381958, + "num_tokens": 587085576.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6142750978469849, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8784540295600891, + "num_tokens": 587118952.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.79220712184906, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8661788702011108, + "num_tokens": 587149302.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4900180101394653, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8732978105545044, + "num_tokens": 587193676.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5196393728256226, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8927344083786011, + "num_tokens": 587228286.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6424510478973389, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8767141699790955, + "num_tokens": 587263110.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.495237946510315, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8678336143493652, + "num_tokens": 587305630.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5246210098266602, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8805748224258423, + "num_tokens": 587341977.0, + "step": 15396 + }, + { + "epoch": 1.958656659458084, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.693025827407837, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8882665038108826, + "num_tokens": 587372402.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.51120126247406, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8678911924362183, + "num_tokens": 587415279.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5856326818466187, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8694123029708862, + "num_tokens": 587451225.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.649275779724121, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.876681923866272, + "num_tokens": 587484408.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5458872318267822, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8699648380279541, + "num_tokens": 587525845.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.653501033782959, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8673725128173828, + "num_tokens": 587560893.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.623887538909912, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8677825331687927, + "num_tokens": 587599793.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6742417812347412, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.881885826587677, + "num_tokens": 587632806.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7235658168792725, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8743903636932373, + "num_tokens": 587665722.0, + "step": 15405 + }, + { + "epoch": 1.9598015519653988, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7296693325042725, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8688859939575195, + "num_tokens": 587699541.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4948234558105469, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.870266854763031, + "num_tokens": 587741755.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5122226476669312, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8761535882949829, + "num_tokens": 587780924.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5425359010696411, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.877638041973114, + "num_tokens": 587819143.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5004771947860718, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8535224795341492, + "num_tokens": 587866902.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6019432544708252, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8561774492263794, + "num_tokens": 587909621.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6530333757400513, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8719098567962646, + "num_tokens": 587948573.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5216165781021118, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8681411147117615, + "num_tokens": 587992755.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6816715002059937, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8688371777534485, + "num_tokens": 588027466.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5902624130249023, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.877393364906311, + "num_tokens": 588064660.0, + "step": 15415 + }, + { + "epoch": 1.961073654751304, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5725276470184326, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8690953254699707, + "num_tokens": 588106869.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.681060552597046, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8709933161735535, + "num_tokens": 588141545.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5069454908370972, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8815125226974487, + "num_tokens": 588180655.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5834754705429077, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8871495723724365, + "num_tokens": 588213453.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5388890504837036, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8725930452346802, + "num_tokens": 588253389.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6574145555496216, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8603153228759766, + "num_tokens": 588293088.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7316596508026123, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8599666357040405, + "num_tokens": 588327853.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.672603726387024, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8698469996452332, + "num_tokens": 588360079.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5176242589950562, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8662994503974915, + "num_tokens": 588404617.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6217116117477417, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8697432279586792, + "num_tokens": 588445043.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6126039028167725, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8650331497192383, + "num_tokens": 588486125.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5701053142547607, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.872143030166626, + "num_tokens": 588523492.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4380401372909546, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8795838952064514, + "num_tokens": 588567307.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6425074338912964, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8830125331878662, + "num_tokens": 588601959.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6623765230178833, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8719782829284668, + "num_tokens": 588638020.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4936729669570923, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8844072818756104, + "num_tokens": 588677505.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5733470916748047, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8771458864212036, + "num_tokens": 588718735.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5649878978729248, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8727887272834778, + "num_tokens": 588761096.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.668275237083435, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8716825246810913, + "num_tokens": 588795888.0, + "step": 15434 + }, + { + "epoch": 1.9634906500445237, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4563883543014526, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8737242817878723, + "num_tokens": 588838115.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4778140783309937, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8788451552391052, + "num_tokens": 588879944.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5000594854354858, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8502645492553711, + "num_tokens": 588925579.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5766003131866455, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8730928897857666, + "num_tokens": 588963263.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 2.029825210571289, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.873762845993042, + "num_tokens": 588995019.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5457549095153809, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8760855793952942, + "num_tokens": 589031576.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.584800362586975, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8721795082092285, + "num_tokens": 589067597.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5412043333053589, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8904035687446594, + "num_tokens": 589104766.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7377625703811646, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8855214715003967, + "num_tokens": 589136237.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6046788692474365, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8644000887870789, + "num_tokens": 589177184.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5285855531692505, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8796951770782471, + "num_tokens": 589214874.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5861274003982544, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8693656921386719, + "num_tokens": 589252842.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.68590247631073, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8482911586761475, + "num_tokens": 589289288.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.616607666015625, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8708416223526001, + "num_tokens": 589330234.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4376224279403687, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8837139010429382, + "num_tokens": 589370460.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4965966939926147, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8682554960250854, + "num_tokens": 589412582.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6079370975494385, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8669257760047913, + "num_tokens": 589451472.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.581883192062378, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.877498209476471, + "num_tokens": 589489022.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5121482610702515, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8796331286430359, + "num_tokens": 589527449.0, + "step": 15453 + }, + { + "epoch": 1.9659076453377433, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5511717796325684, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8752729296684265, + "num_tokens": 589565299.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.697546362876892, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8555124998092651, + "num_tokens": 589601119.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7240338325500488, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8614317178726196, + "num_tokens": 589636765.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6792397499084473, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8545109033584595, + "num_tokens": 589673536.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6955316066741943, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8710101842880249, + "num_tokens": 589705312.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5407227277755737, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8829968571662903, + "num_tokens": 589743501.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4809452295303345, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8790738582611084, + "num_tokens": 589782444.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6436548233032227, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.870956540107727, + "num_tokens": 589818929.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.73915696144104, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8740170001983643, + "num_tokens": 589852242.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.879379153251648, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8745484948158264, + "num_tokens": 589885408.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6042715311050415, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8851535320281982, + "num_tokens": 589919810.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5267987251281738, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8721879124641418, + "num_tokens": 589962228.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.477433443069458, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8923380970954895, + "num_tokens": 590003403.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7098785638809204, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8754574060440063, + "num_tokens": 590039264.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7302117347717285, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8588225841522217, + "num_tokens": 590075511.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.610015869140625, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8782131671905518, + "num_tokens": 590111649.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5402748584747314, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8839802742004395, + "num_tokens": 590151614.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6702884435653687, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8778907656669617, + "num_tokens": 590188528.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6281123161315918, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8852227926254272, + "num_tokens": 590220805.0, + "step": 15472 + }, + { + "epoch": 1.968324640630963, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6971591711044312, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8806695342063904, + "num_tokens": 590254272.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7006373405456543, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8729194402694702, + "num_tokens": 590288277.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6599011421203613, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.869789183139801, + "num_tokens": 590324298.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.487176775932312, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8846290111541748, + "num_tokens": 590365669.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5489797592163086, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8681129217147827, + "num_tokens": 590402809.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5600299835205078, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.875664234161377, + "num_tokens": 590441578.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6165249347686768, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8723849058151245, + "num_tokens": 590477958.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7948408126831055, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8586461544036865, + "num_tokens": 590511366.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.580280065536499, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8771494626998901, + "num_tokens": 590549312.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5517866611480713, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8756262063980103, + "num_tokens": 590587159.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4575819969177246, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8764730095863342, + "num_tokens": 590627977.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.552140474319458, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8775213956832886, + "num_tokens": 590667837.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.57613205909729, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8727421760559082, + "num_tokens": 590707652.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.661506175994873, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8617810606956482, + "num_tokens": 590745974.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7574443817138672, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8708471059799194, + "num_tokens": 590780566.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.506954312324524, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8676379919052124, + "num_tokens": 590821673.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6667729616165161, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.850813627243042, + "num_tokens": 590862443.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5549463033676147, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8855384588241577, + "num_tokens": 590895830.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4875091314315796, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8593097925186157, + "num_tokens": 590941023.0, + "step": 15491 + }, + { + "epoch": 1.9707416359241827, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.742246389389038, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8630334138870239, + "num_tokens": 590977593.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6640279293060303, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8755288124084473, + "num_tokens": 591014898.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5577893257141113, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8805757761001587, + "num_tokens": 591051493.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5220084190368652, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8842270374298096, + "num_tokens": 591091769.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8624627590179443, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.8419564962387085, + "num_tokens": 591124699.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6524012088775635, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8768350481987, + "num_tokens": 591159618.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4800662994384766, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8766016364097595, + "num_tokens": 591199954.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6142277717590332, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8698240518569946, + "num_tokens": 591240196.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6126095056533813, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8610821962356567, + "num_tokens": 591279696.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 2.269765615463257, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8579429984092712, + "num_tokens": 591312815.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.704172134399414, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8720601201057434, + "num_tokens": 591345225.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.795408010482788, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8600707650184631, + "num_tokens": 591376083.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6721464395523071, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8566315174102783, + "num_tokens": 591417920.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4944123029708862, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8596693277359009, + "num_tokens": 591461720.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4370217323303223, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8801023364067078, + "num_tokens": 591500565.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.50499427318573, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8826513886451721, + "num_tokens": 591537296.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.4200468063354492, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.8874102830886841, + "num_tokens": 591577487.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6581966876983643, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8834018707275391, + "num_tokens": 591611831.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.668913722038269, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8806498050689697, + "num_tokens": 591642769.0, + "step": 15510 + }, + { + "epoch": 1.9731586312174023, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5531572103500366, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8811503648757935, + "num_tokens": 591681084.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6456573009490967, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8754003643989563, + "num_tokens": 591716451.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6819647550582886, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8654240369796753, + "num_tokens": 591753427.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6589126586914062, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8844073414802551, + "num_tokens": 591787645.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.8652148246765137, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.870963454246521, + "num_tokens": 591821729.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5371190309524536, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8693679571151733, + "num_tokens": 591859737.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.856954574584961, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8696510791778564, + "num_tokens": 591891481.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.7016273736953735, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8750734925270081, + "num_tokens": 591924530.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6084818840026855, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8653049468994141, + "num_tokens": 591962524.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5316270589828491, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8757699728012085, + "num_tokens": 592006267.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.635654091835022, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8837867975234985, + "num_tokens": 592041253.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6966347694396973, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8769855499267578, + "num_tokens": 592073291.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6840291023254395, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.845797598361969, + "num_tokens": 592114513.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.617925763130188, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.864582359790802, + "num_tokens": 592155914.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.5545157194137573, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8630489110946655, + "num_tokens": 592198552.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 2.491474151611328e-05, + "grad_norm": 1.6079893112182617, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8560829162597656, + "num_tokens": 592240156.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6019669771194458, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8615247011184692, + "num_tokens": 592278733.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5467215776443481, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8752687573432922, + "num_tokens": 592316005.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.752890706062317, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8632089495658875, + "num_tokens": 592348873.0, + "step": 15529 + }, + { + "epoch": 1.9755756265106221, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6725819110870361, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8590972423553467, + "num_tokens": 592385681.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7014626264572144, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.85895836353302, + "num_tokens": 592419944.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6479274034500122, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8736558556556702, + "num_tokens": 592455832.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5110862255096436, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8772764205932617, + "num_tokens": 592494478.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6988734006881714, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.86965012550354, + "num_tokens": 592534827.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6120686531066895, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8790698051452637, + "num_tokens": 592571226.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5882294178009033, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.877057671546936, + "num_tokens": 592607313.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6805833578109741, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8635614514350891, + "num_tokens": 592642188.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5130810737609863, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.879355251789093, + "num_tokens": 592683305.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.504218578338623, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.876602292060852, + "num_tokens": 592720899.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.644416093826294, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8729720711708069, + "num_tokens": 592755167.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6462650299072266, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8661990761756897, + "num_tokens": 592790516.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.595617413520813, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8840184807777405, + "num_tokens": 592825408.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.571499228477478, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8636474013328552, + "num_tokens": 592866376.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.578487515449524, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.87346351146698, + "num_tokens": 592903884.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6155304908752441, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8550174832344055, + "num_tokens": 592945151.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.4330395460128784, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8933349251747131, + "num_tokens": 592982736.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5357379913330078, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8738499879837036, + "num_tokens": 593024452.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5583205223083496, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8788869976997375, + "num_tokens": 593065769.0, + "step": 15548 + }, + { + "epoch": 1.9779926218038417, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7669554948806763, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8643547296524048, + "num_tokens": 593099557.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5597116947174072, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.862707257270813, + "num_tokens": 593144911.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6493644714355469, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8866997361183167, + "num_tokens": 593181827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5048377513885498, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8832622766494751, + "num_tokens": 593222862.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4540797472000122, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8767317533493042, + "num_tokens": 593265120.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4987350702285767, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8760185241699219, + "num_tokens": 593303923.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5486818552017212, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8718849420547485, + "num_tokens": 593346663.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.496505618095398, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8728938102722168, + "num_tokens": 593387645.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5935673713684082, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.852187991142273, + "num_tokens": 593431179.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7127442359924316, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8565155267715454, + "num_tokens": 593467803.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.659165859222412, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.869107723236084, + "num_tokens": 593501765.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6957099437713623, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8669890761375427, + "num_tokens": 593539905.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.605877161026001, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8649740219116211, + "num_tokens": 593575848.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6227812767028809, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8670237064361572, + "num_tokens": 593616873.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.910618782043457, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8547433614730835, + "num_tokens": 593652093.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5476711988449097, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8605716228485107, + "num_tokens": 593694145.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.538528323173523, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.865669310092926, + "num_tokens": 593736126.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5635052919387817, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8756421804428101, + "num_tokens": 593772289.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4771267175674438, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8780708312988281, + "num_tokens": 593816526.0, + "step": 15567 + }, + { + "epoch": 1.9804096170970613, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5661485195159912, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8645532727241516, + "num_tokens": 593856499.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5527907609939575, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8781327605247498, + "num_tokens": 593896024.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4958364963531494, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8890955448150635, + "num_tokens": 593936508.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4223034381866455, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8775545358657837, + "num_tokens": 593984176.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.629196286201477, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8714845776557922, + "num_tokens": 594024368.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6057748794555664, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8684818744659424, + "num_tokens": 594062533.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5929861068725586, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8774036169052124, + "num_tokens": 594097039.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5352015495300293, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8699034452438354, + "num_tokens": 594137426.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5192252397537231, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8804692029953003, + "num_tokens": 594173433.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5597225427627563, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8728309869766235, + "num_tokens": 594213864.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7644351720809937, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.850835919380188, + "num_tokens": 594248050.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.749906301498413, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8579500913619995, + "num_tokens": 594282686.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.711683988571167, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.867878258228302, + "num_tokens": 594318355.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.533409833908081, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8741955161094666, + "num_tokens": 594362117.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.626308798789978, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8890158534049988, + "num_tokens": 594397749.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5448698997497559, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8757030963897705, + "num_tokens": 594435518.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6147409677505493, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.881316602230072, + "num_tokens": 594469362.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5513030290603638, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8731163740158081, + "num_tokens": 594506661.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7407653331756592, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8628138899803162, + "num_tokens": 594547151.0, + "step": 15586 + }, + { + "epoch": 1.9828266123902811, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4045264720916748, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.876664400100708, + "num_tokens": 594595142.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.606482982635498, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8712814450263977, + "num_tokens": 594631769.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4956928491592407, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8658133745193481, + "num_tokens": 594676407.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5696814060211182, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8716587424278259, + "num_tokens": 594718687.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.645131230354309, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8782510161399841, + "num_tokens": 594759386.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5063354969024658, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8702837824821472, + "num_tokens": 594800416.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5023411512374878, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8800340890884399, + "num_tokens": 594836482.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.541847586631775, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8833326697349548, + "num_tokens": 594872930.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5553721189498901, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8770022988319397, + "num_tokens": 594913405.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6207022666931152, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.867296576499939, + "num_tokens": 594949583.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6957967281341553, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8502480983734131, + "num_tokens": 594989420.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.465724229812622, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8780836462974548, + "num_tokens": 595033849.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5652990341186523, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8734302520751953, + "num_tokens": 595074539.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4889565706253052, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8739069700241089, + "num_tokens": 595115649.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7143652439117432, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.880214273929596, + "num_tokens": 595147273.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5847913026809692, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8719002604484558, + "num_tokens": 595184672.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6539301872253418, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8703691959381104, + "num_tokens": 595225247.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6191898584365845, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8667020797729492, + "num_tokens": 595263575.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4385024309158325, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8780552744865417, + "num_tokens": 595306672.0, + "step": 15605 + }, + { + "epoch": 1.985243607683501, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.643112301826477, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8825870752334595, + "num_tokens": 595340424.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 16.577383041381836, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8758995532989502, + "num_tokens": 595381409.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6408010721206665, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8866438269615173, + "num_tokens": 595416815.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5317453145980835, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8782929182052612, + "num_tokens": 595454151.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5271406173706055, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8879786729812622, + "num_tokens": 595491409.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6076756715774536, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8639132976531982, + "num_tokens": 595532313.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5706400871276855, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8707981109619141, + "num_tokens": 595568546.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.801431655883789, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8545383214950562, + "num_tokens": 595604321.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5296156406402588, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8663346767425537, + "num_tokens": 595645054.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6606563329696655, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8718510866165161, + "num_tokens": 595682402.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6526949405670166, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8618720769882202, + "num_tokens": 595721030.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.473883867263794, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8893358707427979, + "num_tokens": 595762408.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.616253137588501, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.868110716342926, + "num_tokens": 595801710.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5698692798614502, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8763126730918884, + "num_tokens": 595840441.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.589828610420227, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8760516047477722, + "num_tokens": 595874402.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6558582782745361, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8510020971298218, + "num_tokens": 595911017.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5969709157943726, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8765459060668945, + "num_tokens": 595947792.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5593020915985107, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8622093796730042, + "num_tokens": 595986354.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5849285125732422, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8864907622337341, + "num_tokens": 596021360.0, + "step": 15624 + }, + { + "epoch": 1.9876606029767205, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5536365509033203, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.883025050163269, + "num_tokens": 596057897.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5624831914901733, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8760212063789368, + "num_tokens": 596097701.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6809231042861938, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8609342575073242, + "num_tokens": 596134642.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5890949964523315, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8786904811859131, + "num_tokens": 596171173.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5796403884887695, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8695951700210571, + "num_tokens": 596208763.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7002663612365723, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8825185298919678, + "num_tokens": 596241147.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6300640106201172, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8781309723854065, + "num_tokens": 596279376.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4749819040298462, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8702186346054077, + "num_tokens": 596321467.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5030544996261597, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8707461357116699, + "num_tokens": 596362819.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4992443323135376, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8730748891830444, + "num_tokens": 596405442.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5627892017364502, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8634401559829712, + "num_tokens": 596445443.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5435845851898193, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8618664145469666, + "num_tokens": 596485518.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6455425024032593, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.869575023651123, + "num_tokens": 596522112.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6909891366958618, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8869349956512451, + "num_tokens": 596554210.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7668925523757935, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8726108074188232, + "num_tokens": 596588926.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5665018558502197, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8684086203575134, + "num_tokens": 596630836.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5526459217071533, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8793145418167114, + "num_tokens": 596670973.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.595916509628296, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8625272512435913, + "num_tokens": 596713951.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.510796308517456, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8831771016120911, + "num_tokens": 596754278.0, + "step": 15643 + }, + { + "epoch": 1.9900775982699401, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6461303234100342, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8826852440834045, + "num_tokens": 596787878.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.554466962814331, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8658081889152527, + "num_tokens": 596830327.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6478019952774048, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8613841533660889, + "num_tokens": 596869300.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.615903377532959, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8717113733291626, + "num_tokens": 596911355.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.566854476928711, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.867887556552887, + "num_tokens": 596948827.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6538879871368408, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.873416006565094, + "num_tokens": 596980210.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5566482543945312, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8695646524429321, + "num_tokens": 597021249.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4718278646469116, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8891845941543579, + "num_tokens": 597062288.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.649991512298584, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8667830228805542, + "num_tokens": 597106091.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4888619184494019, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8672928214073181, + "num_tokens": 597149125.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5669127702713013, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8750275373458862, + "num_tokens": 597188375.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5173149108886719, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.874032735824585, + "num_tokens": 597229264.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6354632377624512, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8654134273529053, + "num_tokens": 597268805.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.574582815170288, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8669165372848511, + "num_tokens": 597310265.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.608544945716858, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8822096586227417, + "num_tokens": 597345599.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.8368098735809326, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8723421096801758, + "num_tokens": 597381658.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.568807601928711, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8748489618301392, + "num_tokens": 597421044.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.610277533531189, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8739646077156067, + "num_tokens": 597457742.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4911588430404663, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8635802268981934, + "num_tokens": 597504927.0, + "step": 15662 + }, + { + "epoch": 1.99249459356316, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6759597063064575, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8655997514724731, + "num_tokens": 597541936.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.952860713005066, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8602073192596436, + "num_tokens": 597571451.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5031936168670654, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8803619146347046, + "num_tokens": 597612639.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7103891372680664, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.866763710975647, + "num_tokens": 597646871.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6179063320159912, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.874573290348053, + "num_tokens": 597687873.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7561180591583252, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8644149303436279, + "num_tokens": 597720684.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6772898435592651, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8490613102912903, + "num_tokens": 597761439.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4992527961730957, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8741638660430908, + "num_tokens": 597800823.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6194608211517334, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8825340867042542, + "num_tokens": 597834708.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7709380388259888, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8882920742034912, + "num_tokens": 597863569.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4688631296157837, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8702462911605835, + "num_tokens": 597906560.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4360530376434326, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8745449781417847, + "num_tokens": 597950479.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4529048204421997, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8729573488235474, + "num_tokens": 597994632.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.623634696006775, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8697587251663208, + "num_tokens": 598030390.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6123543977737427, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8784690499305725, + "num_tokens": 598067682.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6629315614700317, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8722943663597107, + "num_tokens": 598106769.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6735481023788452, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8793753981590271, + "num_tokens": 598143916.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.741017460823059, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8677035570144653, + "num_tokens": 598180948.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6408438682556152, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8757175207138062, + "num_tokens": 598219137.0, + "step": 15681 + }, + { + "epoch": 1.9949115888563798, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6420034170150757, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8912401795387268, + "num_tokens": 598258264.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6734005212783813, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8750489354133606, + "num_tokens": 598292991.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5169700384140015, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8701238632202148, + "num_tokens": 598332232.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5864832401275635, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8608351945877075, + "num_tokens": 598372980.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7239603996276855, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8712603449821472, + "num_tokens": 598406362.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.644980788230896, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8763604760169983, + "num_tokens": 598442127.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.4480547904968262, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8730379939079285, + "num_tokens": 598490677.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.73127281665802, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8839537501335144, + "num_tokens": 598520970.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.489237666130066, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8753986358642578, + "num_tokens": 598562514.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5554826259613037, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8686245679855347, + "num_tokens": 598602626.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5336601734161377, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8664117455482483, + "num_tokens": 598644148.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6124482154846191, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8739368319511414, + "num_tokens": 598678758.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6321943998336792, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8822565078735352, + "num_tokens": 598715512.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5056803226470947, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8620342016220093, + "num_tokens": 598758503.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5545560121536255, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8718061447143555, + "num_tokens": 598797642.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5363606214523315, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8675020933151245, + "num_tokens": 598839931.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6906706094741821, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8814449906349182, + "num_tokens": 598874289.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5863779783248901, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8695790767669678, + "num_tokens": 598914946.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5739095211029053, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8781007528305054, + "num_tokens": 598950888.0, + "step": 15700 + }, + { + "epoch": 1.9973285841495994, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.6066924333572388, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8625982999801636, + "num_tokens": 598989563.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.5444833040237427, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.881528377532959, + "num_tokens": 599023689.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5831222534179688, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8652083873748779, + "num_tokens": 599061964.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.4847025871276855, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8886678218841553, + "num_tokens": 599102836.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 2.5033950805664062e-05, + "grad_norm": 1.7650424242019653, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8675802946090698, + "num_tokens": 599142429.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7708733081817627, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8748235702514648, + "num_tokens": 599177815.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5279150009155273, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8766727447509766, + "num_tokens": 599215473.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5937292575836182, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8670056462287903, + "num_tokens": 599255987.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6901322603225708, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8748160600662231, + "num_tokens": 599289592.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5526965856552124, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8838938474655151, + "num_tokens": 599322651.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6342651844024658, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8719934821128845, + "num_tokens": 599357802.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6770707368850708, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8677615523338318, + "num_tokens": 599393825.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.9055203199386597, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8726190328598022, + "num_tokens": 599424596.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6803629398345947, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8589619994163513, + "num_tokens": 599460534.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.570008635520935, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8844277262687683, + "num_tokens": 599495758.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5934782028198242, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8726674914360046, + "num_tokens": 599532103.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5516893863677979, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8697832822799683, + "num_tokens": 599572200.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5640249252319336, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8710289001464844, + "num_tokens": 599611882.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.4982192516326904, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8886677026748657, + "num_tokens": 599652369.0, + "step": 15719 + }, + { + "epoch": 1.999745579442819, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.581602931022644, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8619360327720642, + "num_tokens": 599695407.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.4906450510025024, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8697491884231567, + "num_tokens": 599737531.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5802563428878784, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8774740099906921, + "num_tokens": 599777940.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.4974288940429688, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8865528106689453, + "num_tokens": 599814586.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.4594800472259521, + "learning_rate": 1e-06, + "loss": 0.3226, + "mean_token_accuracy": 0.8923866748809814, + "num_tokens": 599853859.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.68641996383667, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8677365779876709, + "num_tokens": 599886515.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6788830757141113, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8711373805999756, + "num_tokens": 599925880.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6492258310317993, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8724997043609619, + "num_tokens": 599965551.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5305174589157104, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.870675265789032, + "num_tokens": 600007936.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.665562629699707, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8790901899337769, + "num_tokens": 600044867.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7283047437667847, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.863118588924408, + "num_tokens": 600082606.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6941999197006226, + "learning_rate": 1e-06, + "loss": 0.3284, + "mean_token_accuracy": 0.8904670476913452, + "num_tokens": 600115900.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7031645774841309, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8687927722930908, + "num_tokens": 600154559.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6724300384521484, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.862821638584137, + "num_tokens": 600200557.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6080496311187744, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8844443559646606, + "num_tokens": 600242416.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7687301635742188, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8754997849464417, + "num_tokens": 600276834.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6471408605575562, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8665488362312317, + "num_tokens": 600314209.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.629938006401062, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8751112222671509, + "num_tokens": 600352155.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7088834047317505, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8744067549705505, + "num_tokens": 600387656.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7454479932785034, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8791013360023499, + "num_tokens": 600422190.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6437793970108032, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8704169988632202, + "num_tokens": 600459689.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6947294473648071, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8784181475639343, + "num_tokens": 600494434.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.567840337753296, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8791391253471375, + "num_tokens": 600536169.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.5210179090499878, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8759828805923462, + "num_tokens": 600578085.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.751935362815857, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8671445846557617, + "num_tokens": 600613954.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.7183423042297363, + "learning_rate": 1e-06, + "loss": 0.3132, + "mean_token_accuracy": 0.8989119529724121, + "num_tokens": 600647728.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.970413327217102, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8442012071609497, + "num_tokens": 600685224.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.536144495010376, + "learning_rate": 1e-06, + "loss": 0.3445, + "mean_token_accuracy": 0.885138213634491, + "num_tokens": 600724261.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6774406433105469, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8691411018371582, + "num_tokens": 600768004.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.6702953577041626, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8884934186935425, + "num_tokens": 600803289.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 2.5153160095214844e-05, + "grad_norm": 1.663335919380188, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8805631399154663, + "num_tokens": 600843845.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.895273208618164, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8670376539230347, + "num_tokens": 600875559.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7393232583999634, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8804460763931274, + "num_tokens": 600911028.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6595463752746582, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8827962875366211, + "num_tokens": 600948314.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.61043381690979, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8787399530410767, + "num_tokens": 600985958.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7685472965240479, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8734735250473022, + "num_tokens": 601020262.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6140453815460205, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8868610262870789, + "num_tokens": 601063611.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7413356304168701, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8771588802337646, + "num_tokens": 601096397.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5899661779403687, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8828505277633667, + "num_tokens": 601137902.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5089906454086304, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.891421914100647, + "num_tokens": 601178563.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7194254398345947, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8566712141036987, + "num_tokens": 601221387.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6298978328704834, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8849380612373352, + "num_tokens": 601255152.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7119299173355103, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8869699239730835, + "num_tokens": 601286210.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6397693157196045, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8759430646896362, + "num_tokens": 601325647.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6716362237930298, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.884109616279602, + "num_tokens": 601362291.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6083821058273315, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8745121955871582, + "num_tokens": 601403717.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.566651463508606, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8726798295974731, + "num_tokens": 601446052.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7454520463943481, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.877671480178833, + "num_tokens": 601480380.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5960639715194702, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8787376284599304, + "num_tokens": 601519378.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6040197610855103, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8834147453308105, + "num_tokens": 601558891.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7430131435394287, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8725651502609253, + "num_tokens": 601589963.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.659355878829956, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8777667284011841, + "num_tokens": 601627497.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6889371871948242, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8868059515953064, + "num_tokens": 601662498.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6172431707382202, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8745852112770081, + "num_tokens": 601699524.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.616033673286438, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8800264596939087, + "num_tokens": 601737893.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.64314603805542, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8790857791900635, + "num_tokens": 601773660.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.553607702255249, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8861453533172607, + "num_tokens": 601813150.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6081973314285278, + "learning_rate": 1e-06, + "loss": 0.3108, + "mean_token_accuracy": 0.896757960319519, + "num_tokens": 601848753.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6791324615478516, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.875627875328064, + "num_tokens": 601885879.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.9199671745300293, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8780621290206909, + "num_tokens": 601922999.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.9363993406295776, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8606762290000916, + "num_tokens": 601958022.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.8174149990081787, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8695553541183472, + "num_tokens": 601992950.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6365714073181152, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8697701692581177, + "num_tokens": 602032871.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6074943542480469, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8769875764846802, + "num_tokens": 602069093.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.793617844581604, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8698625564575195, + "num_tokens": 602102909.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7833408117294312, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8776415586471558, + "num_tokens": 602138119.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.8437281847000122, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8846904039382935, + "num_tokens": 602173885.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5431623458862305, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8795433044433594, + "num_tokens": 602219272.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6150848865509033, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8624891042709351, + "num_tokens": 602261158.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7098125219345093, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8846187591552734, + "num_tokens": 602297266.0, + "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.716130256652832, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.872108519077301, + "num_tokens": 602333759.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.709832787513733, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8620625734329224, + "num_tokens": 602369317.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7054848670959473, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8814666271209717, + "num_tokens": 602403590.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7260138988494873, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8707793951034546, + "num_tokens": 602439459.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.620909333229065, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8795948028564453, + "num_tokens": 602479151.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5617092847824097, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8876685500144958, + "num_tokens": 602517029.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5843480825424194, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.879848062992096, + "num_tokens": 602558000.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.587969183921814, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8791458606719971, + "num_tokens": 602599411.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.4209020137786865, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8732408285140991, + "num_tokens": 602646188.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5573948621749878, + "learning_rate": 1e-06, + "loss": 0.311, + "mean_token_accuracy": 0.8959572315216064, + "num_tokens": 602683739.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.774389624595642, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8795614242553711, + "num_tokens": 602716665.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6760289669036865, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.870398759841919, + "num_tokens": 602752527.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7004340887069702, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8887261152267456, + "num_tokens": 602784134.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5196675062179565, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8894543647766113, + "num_tokens": 602823563.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.538434624671936, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8681018352508545, + "num_tokens": 602867202.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6586248874664307, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8670057654380798, + "num_tokens": 602903990.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6374928951263428, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8716446161270142, + "num_tokens": 602942421.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6672868728637695, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8829847574234009, + "num_tokens": 602977797.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6340866088867188, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.884627103805542, + "num_tokens": 603015188.0, + "step": 15808 + }, + { + "epoch": 2.011067294237374, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6340126991271973, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8681389093399048, + "num_tokens": 603055113.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6811777353286743, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8780087232589722, + "num_tokens": 603088829.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6727195978164673, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8777023553848267, + "num_tokens": 603127443.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.7024091482162476, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8638339042663574, + "num_tokens": 603166279.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5411083698272705, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8813371658325195, + "num_tokens": 603210769.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5861783027648926, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8810316920280457, + "num_tokens": 603249826.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.657140851020813, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8715441226959229, + "num_tokens": 603287722.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5671261548995972, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8748747110366821, + "num_tokens": 603325832.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.5763972997665405, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8882095217704773, + "num_tokens": 603362905.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6698325872421265, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8652065396308899, + "num_tokens": 603399514.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7785550355911255, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.888161301612854, + "num_tokens": 603440645.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.6049877405166626, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8708513379096985, + "num_tokens": 603480055.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.4995695352554321, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8807294368743896, + "num_tokens": 603526792.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.4819730520248413, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8903313875198364, + "num_tokens": 603567844.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5981636047363281, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8728105425834656, + "num_tokens": 603610549.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 2.5272369384765625e-05, + "grad_norm": 1.606895923614502, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8750848174095154, + "num_tokens": 603651652.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6478527784347534, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8839203119277954, + "num_tokens": 603688671.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7315492630004883, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8752098083496094, + "num_tokens": 603725009.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.8177618980407715, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8614331483840942, + "num_tokens": 603761938.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5800572633743286, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8769913911819458, + "num_tokens": 603803861.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.692391037940979, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8642353415489197, + "num_tokens": 603842109.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 3.731832504272461, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8881021738052368, + "num_tokens": 603875136.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6023173332214355, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8594896793365479, + "num_tokens": 603918250.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6821216344833374, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8597623705863953, + "num_tokens": 603958564.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5494776964187622, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8902678489685059, + "num_tokens": 603996541.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6059547662734985, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8695932626724243, + "num_tokens": 604038281.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6353774070739746, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8754276037216187, + "num_tokens": 604075360.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7969108819961548, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8737499117851257, + "num_tokens": 604108151.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.720177412033081, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8635372519493103, + "num_tokens": 604149776.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.605208158493042, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8872760534286499, + "num_tokens": 604187417.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6336374282836914, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8729691505432129, + "num_tokens": 604225462.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6935229301452637, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8747178316116333, + "num_tokens": 604263154.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5415722131729126, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8747912645339966, + "num_tokens": 604307594.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6530683040618896, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8776301145553589, + "num_tokens": 604342106.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.686182975769043, + "learning_rate": 1e-06, + "loss": 0.3192, + "mean_token_accuracy": 0.8934016227722168, + "num_tokens": 604376231.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7808265686035156, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8769755363464355, + "num_tokens": 604408457.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.672808051109314, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8618767857551575, + "num_tokens": 604448758.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6332558393478394, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8783577084541321, + "num_tokens": 604486755.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.8860139846801758, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8785799741744995, + "num_tokens": 604515792.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6155894994735718, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8847686052322388, + "num_tokens": 604553058.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.623539686203003, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8749716877937317, + "num_tokens": 604592327.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.753366470336914, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8784830570220947, + "num_tokens": 604624385.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5690183639526367, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8751181960105896, + "num_tokens": 604665620.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6947888135910034, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8669382929801941, + "num_tokens": 604704073.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5627977848052979, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8672139644622803, + "num_tokens": 604748696.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7184582948684692, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8739596605300903, + "num_tokens": 604781994.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7422033548355103, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.862584114074707, + "num_tokens": 604819520.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.8176590204238892, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8731980919837952, + "num_tokens": 604854846.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.610874891281128, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8598853349685669, + "num_tokens": 604893989.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6249839067459106, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8740853071212769, + "num_tokens": 604932006.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5734273195266724, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8806090354919434, + "num_tokens": 604972015.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6041364669799805, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8831465244293213, + "num_tokens": 605012127.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5152651071548462, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8835150003433228, + "num_tokens": 605052896.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.5420178174972534, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8858648538589478, + "num_tokens": 605095881.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6802269220352173, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8906850814819336, + "num_tokens": 605126714.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6333470344543457, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8867809772491455, + "num_tokens": 605161819.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6416369676589966, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8568471074104309, + "num_tokens": 605202611.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6692428588867188, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8784528970718384, + "num_tokens": 605238316.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6752026081085205, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8821576833724976, + "num_tokens": 605274724.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6444789171218872, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8658348321914673, + "num_tokens": 605312969.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.8856385946273804, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8715186715126038, + "num_tokens": 605345481.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6508761644363403, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8752976655960083, + "num_tokens": 605382767.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6134577989578247, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8750414252281189, + "num_tokens": 605424451.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7894073724746704, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8832799792289734, + "num_tokens": 605455243.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7213177680969238, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8853925466537476, + "num_tokens": 605489326.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6440829038619995, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8726751804351807, + "num_tokens": 605526836.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.7368320226669312, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8632566928863525, + "num_tokens": 605564924.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.645072340965271, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.884669840335846, + "num_tokens": 605599345.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 2.5391578674316406e-05, + "grad_norm": 1.6180927753448486, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8887516856193542, + "num_tokens": 605636981.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6619454622268677, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8807713985443115, + "num_tokens": 605672740.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6619324684143066, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8868481516838074, + "num_tokens": 605710525.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5980713367462158, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8867838382720947, + "num_tokens": 605748305.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5945435762405396, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8917897939682007, + "num_tokens": 605784604.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.4772571325302124, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8808809518814087, + "num_tokens": 605830908.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.633632779121399, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8734244108200073, + "num_tokens": 605869634.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6468218564987183, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8726987838745117, + "num_tokens": 605906061.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6873712539672852, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8824771642684937, + "num_tokens": 605942494.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6706544160842896, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.878940999507904, + "num_tokens": 605977442.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.655077338218689, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8741880655288696, + "num_tokens": 606014018.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7679802179336548, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8776350021362305, + "num_tokens": 606050829.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5460723638534546, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8713908195495605, + "num_tokens": 606095464.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6063761711120605, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8790123462677002, + "num_tokens": 606134091.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6986273527145386, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8769472241401672, + "num_tokens": 606169395.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 2.040354013442993, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8813830614089966, + "num_tokens": 606204876.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6640490293502808, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8788145780563354, + "num_tokens": 606241547.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.62122642993927, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8773001432418823, + "num_tokens": 606281488.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6196026802062988, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8785565495491028, + "num_tokens": 606318036.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6855357885360718, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8728440999984741, + "num_tokens": 606356936.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7038978338241577, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8731784820556641, + "num_tokens": 606391745.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.690759301185608, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8788436651229858, + "num_tokens": 606427699.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6300851106643677, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8790218830108643, + "num_tokens": 606464095.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.601096272468567, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8714289665222168, + "num_tokens": 606505002.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.653193473815918, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8826653361320496, + "num_tokens": 606542283.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7888487577438354, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8742576837539673, + "num_tokens": 606577101.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.650239109992981, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8827800750732422, + "num_tokens": 606615188.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.62473726272583, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.8918192386627197, + "num_tokens": 606652857.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5914306640625, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8659859895706177, + "num_tokens": 606698333.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 3.7227017879486084, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8661231398582458, + "num_tokens": 606744773.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.8544906377792358, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8686071038246155, + "num_tokens": 606778618.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7253895998001099, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8771837949752808, + "num_tokens": 606814486.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6787898540496826, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8695802688598633, + "num_tokens": 606854608.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6429864168167114, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8720600605010986, + "num_tokens": 606893240.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.662176251411438, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.863722562789917, + "num_tokens": 606935185.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.489738941192627, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.898937463760376, + "num_tokens": 606976327.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6856369972229004, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8764806985855103, + "num_tokens": 607011459.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6544688940048218, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8855987787246704, + "num_tokens": 607049630.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.588695764541626, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8774569034576416, + "num_tokens": 607088763.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6339718103408813, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.883185625076294, + "num_tokens": 607127652.0, + "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6079747676849365, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8730745315551758, + "num_tokens": 607164552.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6937134265899658, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8775312900543213, + "num_tokens": 607206607.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6514204740524292, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8892335891723633, + "num_tokens": 607240931.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6153764724731445, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8818092346191406, + "num_tokens": 607277824.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5964200496673584, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8569484353065491, + "num_tokens": 607319933.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6658250093460083, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8799037933349609, + "num_tokens": 607354480.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7001491785049438, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8785915374755859, + "num_tokens": 607395435.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6667981147766113, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8603841066360474, + "num_tokens": 607434479.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6869839429855347, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.860936164855957, + "num_tokens": 607476078.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6369481086730957, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8613032102584839, + "num_tokens": 607514821.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6647617816925049, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8678004145622253, + "num_tokens": 607551408.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.511281967163086, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.882112979888916, + "num_tokens": 607593256.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6224937438964844, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8667155504226685, + "num_tokens": 607636271.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7478867769241333, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8635532855987549, + "num_tokens": 607673639.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5631133317947388, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8617175817489624, + "num_tokens": 607715104.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5413202047348022, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8746509552001953, + "num_tokens": 607759478.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 3.760684013366699, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8832126259803772, + "num_tokens": 607801427.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.651543140411377, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8839095234870911, + "num_tokens": 607837981.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7695204019546509, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8770405650138855, + "num_tokens": 607872202.0, + "step": 15935 + }, + { + "epoch": 2.027222999618369, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6545230150222778, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8608205318450928, + "num_tokens": 607912832.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.543933391571045, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8697119951248169, + "num_tokens": 607956650.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.544671893119812, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8788081407546997, + "num_tokens": 607996319.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.601575493812561, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.891132116317749, + "num_tokens": 608033019.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6046547889709473, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8813601732254028, + "num_tokens": 608073546.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.8116178512573242, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8593409061431885, + "num_tokens": 608113565.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6419382095336914, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8829960823059082, + "num_tokens": 608152229.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6243187189102173, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8845884203910828, + "num_tokens": 608190538.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.803942322731018, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8730084896087646, + "num_tokens": 608224630.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5526143312454224, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8874180316925049, + "num_tokens": 608264604.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6403474807739258, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8759894371032715, + "num_tokens": 608304528.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.619288682937622, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8767547607421875, + "num_tokens": 608344815.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6532721519470215, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8742719888687134, + "num_tokens": 608388385.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6198153495788574, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.864453136920929, + "num_tokens": 608427548.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5466995239257812, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8725360035896301, + "num_tokens": 608471385.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7304656505584717, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8608378767967224, + "num_tokens": 608509599.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6070573329925537, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8777444958686829, + "num_tokens": 608546685.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.571256160736084, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8736904859542847, + "num_tokens": 608589141.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6547209024429321, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8813506364822388, + "num_tokens": 608626671.0, + "step": 15954 + }, + { + "epoch": 2.0296399949115886, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5566843748092651, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8618764877319336, + "num_tokens": 608670695.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7533596754074097, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8781164884567261, + "num_tokens": 608703424.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5884661674499512, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8836771845817566, + "num_tokens": 608741652.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6255700588226318, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8722788095474243, + "num_tokens": 608780990.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6331120729446411, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8731398582458496, + "num_tokens": 608822604.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.493776559829712, + "learning_rate": 1e-06, + "loss": 0.3025, + "mean_token_accuracy": 0.899105966091156, + "num_tokens": 608861846.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6425665616989136, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8660624027252197, + "num_tokens": 608902731.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.727512001991272, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8795637488365173, + "num_tokens": 608941147.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.614226222038269, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.879138171672821, + "num_tokens": 608984058.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6338881254196167, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8804195523262024, + "num_tokens": 609022947.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6852761507034302, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8711585998535156, + "num_tokens": 609061324.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5813554525375366, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8824862241744995, + "num_tokens": 609100664.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6099247932434082, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8837400674819946, + "num_tokens": 609144429.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6692755222320557, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8676915168762207, + "num_tokens": 609184559.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.768495798110962, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8735960125923157, + "num_tokens": 609216970.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.4826509952545166, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8845454454421997, + "num_tokens": 609259148.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5557717084884644, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8813623189926147, + "num_tokens": 609296589.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5336445569992065, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8838022947311401, + "num_tokens": 609337787.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6947755813598633, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8670523762702942, + "num_tokens": 609373192.0, + "step": 15973 + }, + { + "epoch": 2.0320569902048087, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.6942071914672852, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8682893514633179, + "num_tokens": 609410438.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7128472328186035, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8757680654525757, + "num_tokens": 609448686.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.8314822912216187, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8839877843856812, + "num_tokens": 609482130.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.7747045755386353, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8798086643218994, + "num_tokens": 609513044.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 2.5510787963867188e-05, + "grad_norm": 1.5843020677566528, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8723364472389221, + "num_tokens": 609553112.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.698028326034546, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8531705737113953, + "num_tokens": 609592039.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7055978775024414, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8738507628440857, + "num_tokens": 609626848.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7357321977615356, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8750287890434265, + "num_tokens": 609661626.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.548403263092041, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.86175537109375, + "num_tokens": 609706434.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.8972208499908447, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8759657144546509, + "num_tokens": 609741244.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7729820013046265, + "learning_rate": 1e-06, + "loss": 0.3241, + "mean_token_accuracy": 0.8893387317657471, + "num_tokens": 609773798.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5323164463043213, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8749039173126221, + "num_tokens": 609818535.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.4732424020767212, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8927335739135742, + "num_tokens": 609861366.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6168164014816284, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8619874715805054, + "num_tokens": 609904392.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6618773937225342, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8693326115608215, + "num_tokens": 609941579.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.696587324142456, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8783513307571411, + "num_tokens": 609982183.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.4981331825256348, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8795666694641113, + "num_tokens": 610025790.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.644880771636963, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8801922798156738, + "num_tokens": 610069220.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.935370683670044, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8627310991287231, + "num_tokens": 610101730.0, + "step": 15992 + }, + { + "epoch": 2.0344739854980283, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6690020561218262, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8607916831970215, + "num_tokens": 610144013.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6794828176498413, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8647903203964233, + "num_tokens": 610183880.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7527720928192139, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8672909736633301, + "num_tokens": 610222087.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6987706422805786, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8649168014526367, + "num_tokens": 610257983.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5126488208770752, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8819844722747803, + "num_tokens": 610301401.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5152266025543213, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.877813994884491, + "num_tokens": 610343352.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.75730299949646, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8845627903938293, + "num_tokens": 610374699.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6366058588027954, + "learning_rate": 1e-06, + "loss": 0.3141, + "mean_token_accuracy": 0.8962854146957397, + "num_tokens": 610407591.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.615037202835083, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8852530121803284, + "num_tokens": 610446017.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7301017045974731, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8722131848335266, + "num_tokens": 610483867.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.742756962776184, + "learning_rate": 1e-06, + "loss": 0.337, + "mean_token_accuracy": 0.8921306133270264, + "num_tokens": 610516573.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7000024318695068, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8775349259376526, + "num_tokens": 610552061.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.759570598602295, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8674340844154358, + "num_tokens": 610585675.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.8341413736343384, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.881734311580658, + "num_tokens": 610617750.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.8135865926742554, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.875443160533905, + "num_tokens": 610652083.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.9466572999954224, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8683391809463501, + "num_tokens": 610685625.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5510963201522827, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8777164220809937, + "num_tokens": 610727368.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6514767408370972, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.883028507232666, + "num_tokens": 610763713.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.532077670097351, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8840951323509216, + "num_tokens": 610805556.0, + "step": 16011 + }, + { + "epoch": 2.036890980791248, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5686886310577393, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8815404176712036, + "num_tokens": 610850143.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.613369345664978, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8720798492431641, + "num_tokens": 610890634.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6153525114059448, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8753526210784912, + "num_tokens": 610926688.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6697065830230713, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8632664680480957, + "num_tokens": 610962182.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5431700944900513, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8895947933197021, + "num_tokens": 611000718.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.569775938987732, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8834283351898193, + "num_tokens": 611041816.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.748523235321045, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8711925148963928, + "num_tokens": 611075945.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6880306005477905, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8764614462852478, + "num_tokens": 611112077.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6505342721939087, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8761171102523804, + "num_tokens": 611150099.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5999252796173096, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8711864948272705, + "num_tokens": 611190559.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.707905888557434, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8697079420089722, + "num_tokens": 611227136.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5494903326034546, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8866479992866516, + "num_tokens": 611266559.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.5179438591003418, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8951104879379272, + "num_tokens": 611306618.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.579508900642395, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8794921636581421, + "num_tokens": 611347587.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6809252500534058, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.8907724618911743, + "num_tokens": 611382640.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5303518772125244, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8809235692024231, + "num_tokens": 611425113.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.8272920846939087, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.866805374622345, + "num_tokens": 611458391.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7683212757110596, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8524879217147827, + "num_tokens": 611496599.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6371394395828247, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8886865377426147, + "num_tokens": 611533015.0, + "step": 16030 + }, + { + "epoch": 2.0393079760844675, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7022714614868164, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8817136287689209, + "num_tokens": 611570041.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 16.69716453552246, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8657717704772949, + "num_tokens": 611613598.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.8445087671279907, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8850408792495728, + "num_tokens": 611641705.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.706495761871338, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8875529766082764, + "num_tokens": 611677856.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7179386615753174, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8880733847618103, + "num_tokens": 611711479.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.643336296081543, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8666306734085083, + "num_tokens": 611751500.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6005133390426636, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8626381158828735, + "num_tokens": 611794887.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6447139978408813, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8803592920303345, + "num_tokens": 611832496.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.585800051689148, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8593978881835938, + "num_tokens": 611875123.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.522524118423462, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8810948133468628, + "num_tokens": 611915402.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6649107933044434, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8807740211486816, + "num_tokens": 611948766.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.7221909761428833, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8836798667907715, + "num_tokens": 611981947.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.477628231048584, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8874989748001099, + "num_tokens": 612024311.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.526853084564209, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8884983062744141, + "num_tokens": 612066257.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5897551774978638, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8759400248527527, + "num_tokens": 612104242.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6484723091125488, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8762447834014893, + "num_tokens": 612141238.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6191045045852661, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8809571266174316, + "num_tokens": 612177721.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.8199591636657715, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.877640426158905, + "num_tokens": 612212224.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6038881540298462, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8767843842506409, + "num_tokens": 612252833.0, + "step": 16049 + }, + { + "epoch": 2.0417249713776875, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5952502489089966, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8885003328323364, + "num_tokens": 612294871.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5404384136199951, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8899579048156738, + "num_tokens": 612333326.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5845056772232056, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8797276020050049, + "num_tokens": 612373729.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.766530990600586, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8751819133758545, + "num_tokens": 612408924.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6600992679595947, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8780140280723572, + "num_tokens": 612449569.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.567453145980835, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8881072998046875, + "num_tokens": 612489677.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.634789228439331, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8898485898971558, + "num_tokens": 612523746.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.5746850967407227, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8749064207077026, + "num_tokens": 612562172.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.6259939670562744, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.876669704914093, + "num_tokens": 612605556.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 2.562999725341797e-05, + "grad_norm": 1.673646330833435, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8732694983482361, + "num_tokens": 612643645.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6126885414123535, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.887561559677124, + "num_tokens": 612679360.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.7948743104934692, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8708620071411133, + "num_tokens": 612715770.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.7936965227127075, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8718586564064026, + "num_tokens": 612752220.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6079915761947632, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8821917772293091, + "num_tokens": 612790360.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7044178247451782, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8787479996681213, + "num_tokens": 612827166.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.8756554126739502, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8676360845565796, + "num_tokens": 612862192.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6221234798431396, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8874080181121826, + "num_tokens": 612898591.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.8968364000320435, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8664027452468872, + "num_tokens": 612929025.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.9390931129455566, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8668698668479919, + "num_tokens": 612958077.0, + "step": 16068 + }, + { + "epoch": 2.044141966670907, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6254216432571411, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8721634149551392, + "num_tokens": 612995230.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.7509409189224243, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8773243427276611, + "num_tokens": 613028359.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6375727653503418, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8600882291793823, + "num_tokens": 613069624.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6826534271240234, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8693804144859314, + "num_tokens": 613106502.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6594372987747192, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8792824745178223, + "num_tokens": 613142370.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.524544596672058, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8893550038337708, + "num_tokens": 613182255.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6276828050613403, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8882204294204712, + "num_tokens": 613221153.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.5811891555786133, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.871105432510376, + "num_tokens": 613264952.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5591967105865479, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8576393723487854, + "num_tokens": 613311309.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.6189780235290527, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8919389843940735, + "num_tokens": 613345571.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 2.574920654296875e-05, + "grad_norm": 1.834341287612915, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8705908060073853, + "num_tokens": 613380821.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5462710857391357, + "learning_rate": 1e-06, + "loss": 0.3353, + "mean_token_accuracy": 0.8911025524139404, + "num_tokens": 613421097.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7043001651763916, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8687817454338074, + "num_tokens": 613459182.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6857420206069946, + "learning_rate": 1e-06, + "loss": 0.307, + "mean_token_accuracy": 0.8975912928581238, + "num_tokens": 613490960.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6146787405014038, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8749721646308899, + "num_tokens": 613530494.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.510031819343567, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8747967481613159, + "num_tokens": 613575814.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6018563508987427, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8773692846298218, + "num_tokens": 613615729.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6056594848632812, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8774025440216064, + "num_tokens": 613656249.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6695302724838257, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.87103271484375, + "num_tokens": 613696570.0, + "step": 16087 + }, + { + "epoch": 2.0465589619641267, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.740209698677063, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.87139892578125, + "num_tokens": 613737873.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.853385329246521, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8688781261444092, + "num_tokens": 613768247.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7135740518569946, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8864216208457947, + "num_tokens": 613802287.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.688391923904419, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8828277587890625, + "num_tokens": 613840906.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.9416464567184448, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.864311933517456, + "num_tokens": 613877226.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.4606655836105347, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8900968432426453, + "num_tokens": 613921080.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6361902952194214, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8791576027870178, + "num_tokens": 613954649.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.62191641330719, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8770778179168701, + "num_tokens": 613993393.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6288527250289917, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8734086751937866, + "num_tokens": 614034365.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.508368730545044, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8739537596702576, + "num_tokens": 614078040.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5394667387008667, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8891295194625854, + "num_tokens": 614117412.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7978440523147583, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8553408980369568, + "num_tokens": 614153913.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6242212057113647, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8841164112091064, + "num_tokens": 614189834.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5831389427185059, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8779394626617432, + "num_tokens": 614229545.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5824507474899292, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8797159194946289, + "num_tokens": 614270304.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6021780967712402, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8851255774497986, + "num_tokens": 614309356.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5550484657287598, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8726016283035278, + "num_tokens": 614353644.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.634035348892212, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8594995737075806, + "num_tokens": 614392767.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5218604803085327, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8847952485084534, + "num_tokens": 614433873.0, + "step": 16106 + }, + { + "epoch": 2.0489759572573463, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.610364556312561, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8791767954826355, + "num_tokens": 614471624.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.4803510904312134, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8733455538749695, + "num_tokens": 614515161.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5663402080535889, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8831610679626465, + "num_tokens": 614555259.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.543493390083313, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8704829216003418, + "num_tokens": 614598407.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6135941743850708, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8767561912536621, + "num_tokens": 614635289.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6241716146469116, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8734833598136902, + "num_tokens": 614675496.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6300106048583984, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8766087293624878, + "num_tokens": 614714904.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5184729099273682, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8872028589248657, + "num_tokens": 614754977.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7380859851837158, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8651171922683716, + "num_tokens": 614792801.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.751433253288269, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8794570565223694, + "num_tokens": 614825274.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6971421241760254, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8819893598556519, + "num_tokens": 614865225.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7718405723571777, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8526425361633301, + "num_tokens": 614904783.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6371194124221802, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8660694360733032, + "num_tokens": 614942282.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6095554828643799, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8757873773574829, + "num_tokens": 614981375.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6980401277542114, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8653454184532166, + "num_tokens": 615022873.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6397762298583984, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8710289001464844, + "num_tokens": 615064148.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6469587087631226, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8799899816513062, + "num_tokens": 615101881.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7409058809280396, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8808739185333252, + "num_tokens": 615133414.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6629773378372192, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8835898637771606, + "num_tokens": 615169437.0, + "step": 16125 + }, + { + "epoch": 2.0513929525505663, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6066322326660156, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8690379858016968, + "num_tokens": 615207736.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.719450831413269, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8736063241958618, + "num_tokens": 615246240.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7099034786224365, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.878066897392273, + "num_tokens": 615284063.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.616896390914917, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8658910989761353, + "num_tokens": 615328876.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7333433628082275, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8799443244934082, + "num_tokens": 615362178.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.716042160987854, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8699913024902344, + "num_tokens": 615400826.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.745104193687439, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8679394721984863, + "num_tokens": 615436056.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7083766460418701, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8921197056770325, + "num_tokens": 615473271.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6431841850280762, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8670160174369812, + "num_tokens": 615509974.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 3.6676509380340576, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8632807731628418, + "num_tokens": 615550321.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6902333498001099, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8805856704711914, + "num_tokens": 615588447.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6271530389785767, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.889659583568573, + "num_tokens": 615624124.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6044749021530151, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8788985013961792, + "num_tokens": 615665145.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5821999311447144, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8750543594360352, + "num_tokens": 615705697.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5869741439819336, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8837540149688721, + "num_tokens": 615742410.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.686644196510315, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.871405303478241, + "num_tokens": 615778670.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.572767734527588, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8939132690429688, + "num_tokens": 615820005.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6375651359558105, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8757872581481934, + "num_tokens": 615857808.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.607607364654541, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8813339471817017, + "num_tokens": 615897632.0, + "step": 16144 + }, + { + "epoch": 2.053809947843786, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7823151350021362, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8750205039978027, + "num_tokens": 615931335.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5637621879577637, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8893473148345947, + "num_tokens": 615969679.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6614123582839966, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8785059452056885, + "num_tokens": 616006697.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6158545017242432, + "learning_rate": 1e-06, + "loss": 0.3231, + "mean_token_accuracy": 0.8927910923957825, + "num_tokens": 616042174.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7569457292556763, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8723536729812622, + "num_tokens": 616075575.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7149933576583862, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8793261051177979, + "num_tokens": 616110421.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6175211668014526, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8823076486587524, + "num_tokens": 616146557.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.464229702949524, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8760676383972168, + "num_tokens": 616188879.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.657056450843811, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8748766779899597, + "num_tokens": 616226283.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6162083148956299, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8774531483650208, + "num_tokens": 616262951.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.8428964614868164, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8629013299942017, + "num_tokens": 616298042.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6040784120559692, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8878998160362244, + "num_tokens": 616335642.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7311978340148926, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8637843132019043, + "num_tokens": 616370194.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7091461420059204, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.883437991142273, + "num_tokens": 616408151.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.4598714113235474, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8822218179702759, + "num_tokens": 616455049.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7593084573745728, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8696529269218445, + "num_tokens": 616492790.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.748883843421936, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8629072904586792, + "num_tokens": 616529837.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.531385898590088, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.886368989944458, + "num_tokens": 616568897.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6003233194351196, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.876350998878479, + "num_tokens": 616606477.0, + "step": 16163 + }, + { + "epoch": 2.0562269431370055, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5790444612503052, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8946470618247986, + "num_tokens": 616643133.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7764812707901, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8750952482223511, + "num_tokens": 616673324.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7450976371765137, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8695354461669922, + "num_tokens": 616707286.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5704697370529175, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8808655738830566, + "num_tokens": 616749459.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.548257827758789, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8666172623634338, + "num_tokens": 616794995.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.600401759147644, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8863826394081116, + "num_tokens": 616834624.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6653891801834106, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8784398436546326, + "num_tokens": 616871340.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7736990451812744, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8724768161773682, + "num_tokens": 616906880.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7030264139175415, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.876534104347229, + "num_tokens": 616943441.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5728763341903687, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.87690669298172, + "num_tokens": 616983157.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7548826932907104, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8689109086990356, + "num_tokens": 617021133.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6086729764938354, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8813784122467041, + "num_tokens": 617058741.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7707728147506714, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8591655492782593, + "num_tokens": 617097179.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5700609683990479, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.881911039352417, + "num_tokens": 617137667.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5484347343444824, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8870202302932739, + "num_tokens": 617178972.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.7569986581802368, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.878731906414032, + "num_tokens": 617215039.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6248589754104614, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8688170909881592, + "num_tokens": 617255495.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6581493616104126, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8688912987709045, + "num_tokens": 617294600.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.5966752767562866, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8750936388969421, + "num_tokens": 617332842.0, + "step": 16182 + }, + { + "epoch": 2.058643938430225, + "ewc_loss": 2.586841583251953e-05, + "grad_norm": 1.6055017709732056, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8811363577842712, + "num_tokens": 617371831.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6871263980865479, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8734028339385986, + "num_tokens": 617409256.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7120503187179565, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.88534015417099, + "num_tokens": 617444963.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5262631177902222, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8886224031448364, + "num_tokens": 617489030.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6708095073699951, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8741062879562378, + "num_tokens": 617526136.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7991654872894287, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8638587594032288, + "num_tokens": 617564245.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6978644132614136, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8829628229141235, + "num_tokens": 617600739.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5718568563461304, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8864486217498779, + "num_tokens": 617636707.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.665850281715393, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8752537965774536, + "num_tokens": 617672607.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5190153121948242, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8860493898391724, + "num_tokens": 617711854.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7395745515823364, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8716566562652588, + "num_tokens": 617745299.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.872590184211731, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8659384250640869, + "num_tokens": 617775714.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.673583984375, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8709452152252197, + "num_tokens": 617812566.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.8426220417022705, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8774474859237671, + "num_tokens": 617844248.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.704473853111267, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8799178600311279, + "num_tokens": 617877448.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6834648847579956, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8869815468788147, + "num_tokens": 617912051.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7913182973861694, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8624932765960693, + "num_tokens": 617950287.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5180139541625977, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8775918483734131, + "num_tokens": 617993778.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7220468521118164, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.862672746181488, + "num_tokens": 618032157.0, + "step": 16201 + }, + { + "epoch": 2.0610609337234447, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6407026052474976, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8725426197052002, + "num_tokens": 618073696.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.591234803199768, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8701232671737671, + "num_tokens": 618115360.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7083125114440918, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8726456165313721, + "num_tokens": 618153774.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6854088306427002, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8677000403404236, + "num_tokens": 618193875.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.904278039932251, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8635890483856201, + "num_tokens": 618224280.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6061553955078125, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8842113018035889, + "num_tokens": 618260366.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5411185026168823, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8801330327987671, + "num_tokens": 618301467.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7094639539718628, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8703175187110901, + "num_tokens": 618341075.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5140265226364136, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8846670389175415, + "num_tokens": 618383623.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.551853895187378, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8736185431480408, + "num_tokens": 618428068.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7521376609802246, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8777793645858765, + "num_tokens": 618460959.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7069227695465088, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8712272644042969, + "num_tokens": 618502414.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5723334550857544, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.889042317867279, + "num_tokens": 618540318.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.609106421470642, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8698720932006836, + "num_tokens": 618582165.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6731750965118408, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8774880170822144, + "num_tokens": 618623302.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7007052898406982, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8842602372169495, + "num_tokens": 618658382.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7059621810913086, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.882225751876831, + "num_tokens": 618695755.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.652451515197754, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8792296648025513, + "num_tokens": 618730569.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.8272901773452759, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8517433404922485, + "num_tokens": 618762435.0, + "step": 16220 + }, + { + "epoch": 2.0634779290166647, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.703568458557129, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8774670362472534, + "num_tokens": 618798472.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7087516784667969, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8653936386108398, + "num_tokens": 618835617.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.650172233581543, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8736034631729126, + "num_tokens": 618872604.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5225348472595215, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8726513385772705, + "num_tokens": 618917171.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.684101939201355, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8756915330886841, + "num_tokens": 618952967.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5781276226043701, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8814266920089722, + "num_tokens": 618989980.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5922338962554932, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8802186846733093, + "num_tokens": 619028099.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5990760326385498, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.862641453742981, + "num_tokens": 619069899.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6700835227966309, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8714859485626221, + "num_tokens": 619109588.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6599441766738892, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8661742210388184, + "num_tokens": 619148156.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.8077939748764038, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8680995106697083, + "num_tokens": 619181414.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.5822221040725708, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8654043674468994, + "num_tokens": 619224616.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7193573713302612, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8682627081871033, + "num_tokens": 619263742.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7530149221420288, + "learning_rate": 1e-06, + "loss": 0.3303, + "mean_token_accuracy": 0.8922154903411865, + "num_tokens": 619304349.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7141799926757812, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8752470016479492, + "num_tokens": 619340272.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6194305419921875, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8793382048606873, + "num_tokens": 619378474.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.7368391752243042, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8900079727172852, + "num_tokens": 619408255.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 2.5987625122070312e-05, + "grad_norm": 1.6532633304595947, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.880314290523529, + "num_tokens": 619443655.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.775838851928711, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8638335466384888, + "num_tokens": 619482280.0, + "step": 16239 + }, + { + "epoch": 2.0658949243098843, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6051058769226074, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8744871616363525, + "num_tokens": 619522197.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7403706312179565, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.872139573097229, + "num_tokens": 619558554.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5789134502410889, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8823492527008057, + "num_tokens": 619598909.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8309634923934937, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8681869506835938, + "num_tokens": 619633825.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5439553260803223, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8612157702445984, + "num_tokens": 619679679.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8306430578231812, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8562227487564087, + "num_tokens": 619713471.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6203604936599731, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8930191993713379, + "num_tokens": 619752451.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.687370777130127, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8597837686538696, + "num_tokens": 619793197.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5573056936264038, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8922563195228577, + "num_tokens": 619831340.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5390100479125977, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8668876886367798, + "num_tokens": 619873096.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7262879610061646, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8875932097434998, + "num_tokens": 619905944.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7853152751922607, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8716003894805908, + "num_tokens": 619940519.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8047561645507812, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8690645694732666, + "num_tokens": 619977639.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5380995273590088, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8950961828231812, + "num_tokens": 620015121.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.656580924987793, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8773728609085083, + "num_tokens": 620052134.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 2.1547558307647705, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8686804175376892, + "num_tokens": 620089618.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5355223417282104, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8761811256408691, + "num_tokens": 620134180.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7129215002059937, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8777949213981628, + "num_tokens": 620167853.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7880929708480835, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8876632452011108, + "num_tokens": 620198174.0, + "step": 16258 + }, + { + "epoch": 2.068311919603104, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6666882038116455, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8712568283081055, + "num_tokens": 620243236.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.805987000465393, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8499038219451904, + "num_tokens": 620279440.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6294341087341309, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8836364150047302, + "num_tokens": 620316977.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7077549695968628, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8621958494186401, + "num_tokens": 620354797.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.633837103843689, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8708986639976501, + "num_tokens": 620392885.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8168619871139526, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8787700533866882, + "num_tokens": 620430910.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7873103618621826, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8628479242324829, + "num_tokens": 620469435.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8177350759506226, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8711965680122375, + "num_tokens": 620502146.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6409422159194946, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8766857385635376, + "num_tokens": 620540717.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5758899450302124, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8789342045783997, + "num_tokens": 620585649.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 3.6998908519744873, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8838373422622681, + "num_tokens": 620619744.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6686285734176636, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8639591336250305, + "num_tokens": 620658867.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 2.0102217197418213, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8692891001701355, + "num_tokens": 620686652.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.4831801652908325, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8824748396873474, + "num_tokens": 620729621.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.82315194606781, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8743811845779419, + "num_tokens": 620761783.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7040783166885376, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8682760000228882, + "num_tokens": 620798521.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.803236484527588, + "learning_rate": 1e-06, + "loss": 0.3302, + "mean_token_accuracy": 0.89222651720047, + "num_tokens": 620828482.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6138001680374146, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8841685056686401, + "num_tokens": 620864405.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5329258441925049, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8863394260406494, + "num_tokens": 620902972.0, + "step": 16277 + }, + { + "epoch": 2.0707289148963235, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.789973497390747, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.879002571105957, + "num_tokens": 620936443.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.819188117980957, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8702402114868164, + "num_tokens": 620966595.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6123675107955933, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8828938603401184, + "num_tokens": 621007633.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7773712873458862, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8706023693084717, + "num_tokens": 621039909.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5874966382980347, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8711310625076294, + "num_tokens": 621081205.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8072829246520996, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8736986517906189, + "num_tokens": 621113805.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.654213309288025, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8773938417434692, + "num_tokens": 621155084.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.626773715019226, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8738808035850525, + "num_tokens": 621193852.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7162718772888184, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8575854897499084, + "num_tokens": 621233020.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6637028455734253, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8736973404884338, + "num_tokens": 621271841.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.577032208442688, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8812217712402344, + "num_tokens": 621313150.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6000277996063232, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8790701627731323, + "num_tokens": 621353165.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5505340099334717, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.878419816493988, + "num_tokens": 621396259.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5982648134231567, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8891849517822266, + "num_tokens": 621435213.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.911191701889038, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8803311586380005, + "num_tokens": 621465715.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.582383632659912, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8855348229408264, + "num_tokens": 621501263.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7429203987121582, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8733577728271484, + "num_tokens": 621538112.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5853627920150757, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8769824504852295, + "num_tokens": 621577320.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7459372282028198, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8756767511367798, + "num_tokens": 621613959.0, + "step": 16296 + }, + { + "epoch": 2.0731459101895435, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.758893609046936, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8696391582489014, + "num_tokens": 621648589.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6639972925186157, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8590436577796936, + "num_tokens": 621690080.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8253008127212524, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.875410795211792, + "num_tokens": 621722754.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6781337261199951, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8602681159973145, + "num_tokens": 621759980.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7024109363555908, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8619611859321594, + "num_tokens": 621800978.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.58705472946167, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8880382180213928, + "num_tokens": 621836644.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7444566488265991, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8849093914031982, + "num_tokens": 621866826.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6876028776168823, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8783286213874817, + "num_tokens": 621902882.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 3.799447536468506, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8708332777023315, + "num_tokens": 621935170.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5565935373306274, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8867404460906982, + "num_tokens": 621975052.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6139426231384277, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8769893050193787, + "num_tokens": 622014931.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7346571683883667, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8714755773544312, + "num_tokens": 622054161.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6970800161361694, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8797942399978638, + "num_tokens": 622089379.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6798642873764038, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.874239444732666, + "num_tokens": 622125686.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6650633811950684, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8869394659996033, + "num_tokens": 622162257.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.632808804512024, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8832703232765198, + "num_tokens": 622199969.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5127023458480835, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8755617141723633, + "num_tokens": 622244402.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.4753862619400024, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8837910890579224, + "num_tokens": 622287572.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5711395740509033, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8712273836135864, + "num_tokens": 622329104.0, + "step": 16315 + }, + { + "epoch": 2.075562905482763, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6736091375350952, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8759167194366455, + "num_tokens": 622367164.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.738982915878296, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8700324296951294, + "num_tokens": 622401831.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5486338138580322, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8800563812255859, + "num_tokens": 622446217.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5793107748031616, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8837161064147949, + "num_tokens": 622482579.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7162470817565918, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8816558122634888, + "num_tokens": 622521654.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7812480926513672, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8786578178405762, + "num_tokens": 622556821.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6566110849380493, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8587537407875061, + "num_tokens": 622595420.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5001736879348755, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8789102435112, + "num_tokens": 622643383.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6462376117706299, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8769590854644775, + "num_tokens": 622679780.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.770728349685669, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8740455508232117, + "num_tokens": 622712557.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5552129745483398, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8824905157089233, + "num_tokens": 622752709.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7053536176681519, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8800698518753052, + "num_tokens": 622790912.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7555158138275146, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8612183332443237, + "num_tokens": 622824913.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6302739381790161, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8682829141616821, + "num_tokens": 622865954.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6331429481506348, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8781735897064209, + "num_tokens": 622901392.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.586567997932434, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.883441686630249, + "num_tokens": 622941506.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.760933756828308, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.873515784740448, + "num_tokens": 622976969.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6973830461502075, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8714802265167236, + "num_tokens": 623014818.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5983824729919434, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8801037669181824, + "num_tokens": 623055186.0, + "step": 16334 + }, + { + "epoch": 2.0779799007759827, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7701834440231323, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8604611158370972, + "num_tokens": 623088713.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6341620683670044, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8880430459976196, + "num_tokens": 623124674.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6748474836349487, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8815728425979614, + "num_tokens": 623161713.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5484592914581299, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8873430490493774, + "num_tokens": 623201791.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6185781955718994, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8697747588157654, + "num_tokens": 623240252.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5632140636444092, + "learning_rate": 1e-06, + "loss": 0.3234, + "mean_token_accuracy": 0.8903030157089233, + "num_tokens": 623279882.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7171638011932373, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8609583377838135, + "num_tokens": 623315792.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.761683702468872, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.874491274356842, + "num_tokens": 623350280.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7803797721862793, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8654768466949463, + "num_tokens": 623390348.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.604915976524353, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8865135312080383, + "num_tokens": 623430596.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7196842432022095, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8813797235488892, + "num_tokens": 623468341.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7603176832199097, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8814202547073364, + "num_tokens": 623501875.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.781536340713501, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8763633370399475, + "num_tokens": 623542408.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.614524483680725, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.876105785369873, + "num_tokens": 623583243.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6403390169143677, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8660768270492554, + "num_tokens": 623626461.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5849305391311646, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8972706198692322, + "num_tokens": 623660946.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8031790256500244, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8644247055053711, + "num_tokens": 623697046.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.54021155834198, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8718194961547852, + "num_tokens": 623744063.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.611878752708435, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8783119916915894, + "num_tokens": 623785840.0, + "step": 16353 + }, + { + "epoch": 2.0803968960692023, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6376925706863403, + "learning_rate": 1e-06, + "loss": 0.3168, + "mean_token_accuracy": 0.8938724994659424, + "num_tokens": 623819837.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6202925443649292, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8740754127502441, + "num_tokens": 623860215.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.459031581878662, + "learning_rate": 1e-06, + "loss": 0.328, + "mean_token_accuracy": 0.8926844596862793, + "num_tokens": 623901477.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.797906517982483, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8812148571014404, + "num_tokens": 623932174.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7955580949783325, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8770321011543274, + "num_tokens": 623965124.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7121163606643677, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8865776062011719, + "num_tokens": 624000722.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6738622188568115, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8762385845184326, + "num_tokens": 624039214.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5620754957199097, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8778417110443115, + "num_tokens": 624078897.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.4818633794784546, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8828163146972656, + "num_tokens": 624122459.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6866573095321655, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8733125925064087, + "num_tokens": 624162564.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5682958364486694, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.88282710313797, + "num_tokens": 624203586.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8215863704681396, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8760042190551758, + "num_tokens": 624243542.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6569770574569702, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8815472722053528, + "num_tokens": 624278008.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6041992902755737, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8869452476501465, + "num_tokens": 624313681.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6761599779129028, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8759130239486694, + "num_tokens": 624348760.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.638766884803772, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8676335215568542, + "num_tokens": 624389778.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7853504419326782, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8784534931182861, + "num_tokens": 624426607.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5308116674423218, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.882797360420227, + "num_tokens": 624469842.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6064831018447876, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8623618483543396, + "num_tokens": 624513822.0, + "step": 16372 + }, + { + "epoch": 2.082813891362422, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6471971273422241, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8713595867156982, + "num_tokens": 624552795.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.4116075038909912, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8785576820373535, + "num_tokens": 624604904.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6558037996292114, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.883597731590271, + "num_tokens": 624639162.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7566847801208496, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8743794560432434, + "num_tokens": 624670874.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.577096700668335, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8724384307861328, + "num_tokens": 624713709.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 2.276869773864746, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8730577230453491, + "num_tokens": 624749893.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6194185018539429, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8680203557014465, + "num_tokens": 624793455.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.577960729598999, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8826174736022949, + "num_tokens": 624831515.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6030064821243286, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8691927194595337, + "num_tokens": 624871562.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7770825624465942, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8782941102981567, + "num_tokens": 624905198.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.824829339981079, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8615741729736328, + "num_tokens": 624938367.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5685921907424927, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8828266859054565, + "num_tokens": 624979576.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.805410385131836, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8716098070144653, + "num_tokens": 625015437.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7278084754943848, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8684103488922119, + "num_tokens": 625053189.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6632473468780518, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8661268949508667, + "num_tokens": 625092401.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.776260495185852, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8717447519302368, + "num_tokens": 625131616.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.597245693206787, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.866905152797699, + "num_tokens": 625174274.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7212893962860107, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8610807657241821, + "num_tokens": 625210449.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.650370478630066, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.8924572467803955, + "num_tokens": 625246762.0, + "step": 16391 + }, + { + "epoch": 2.085230886655642, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6927390098571777, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.870789647102356, + "num_tokens": 625287017.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6819264888763428, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8870791792869568, + "num_tokens": 625320500.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5969780683517456, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.868618905544281, + "num_tokens": 625360543.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.638626217842102, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8694965839385986, + "num_tokens": 625401350.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.683628797531128, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8777778744697571, + "num_tokens": 625437577.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6721827983856201, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8803306818008423, + "num_tokens": 625470208.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7156535387039185, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8697495460510254, + "num_tokens": 625508472.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6302030086517334, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8856101036071777, + "num_tokens": 625544527.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6688034534454346, + "learning_rate": 1e-06, + "loss": 0.3157, + "mean_token_accuracy": 0.8941048383712769, + "num_tokens": 625575389.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.747139811515808, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8569442629814148, + "num_tokens": 625609673.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.614186406135559, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8703440427780151, + "num_tokens": 625651323.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6751716136932373, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.867964506149292, + "num_tokens": 625690554.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 2.206890106201172, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8937655687332153, + "num_tokens": 625728909.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.560057520866394, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8845911026000977, + "num_tokens": 625768486.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6598172187805176, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8695002794265747, + "num_tokens": 625806756.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7416962385177612, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8731658458709717, + "num_tokens": 625844650.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5549532175064087, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8827877044677734, + "num_tokens": 625885551.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5860848426818848, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8743609189987183, + "num_tokens": 625924603.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6206787824630737, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8669952750205994, + "num_tokens": 625966120.0, + "step": 16410 + }, + { + "epoch": 2.0876478819488615, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6075332164764404, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8653213977813721, + "num_tokens": 626005642.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.673379898071289, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8721035718917847, + "num_tokens": 626043695.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.642259955406189, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8772639036178589, + "num_tokens": 626086226.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.793468713760376, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8720415234565735, + "num_tokens": 626119903.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8036516904830933, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8751962184906006, + "num_tokens": 626151812.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7839972972869873, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8535844087600708, + "num_tokens": 626195083.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7674481868743896, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8832312822341919, + "num_tokens": 626229581.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7205771207809448, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8672235012054443, + "num_tokens": 626266100.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6338332891464233, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8634771704673767, + "num_tokens": 626308044.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5157063007354736, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8855994343757629, + "num_tokens": 626351560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6368849277496338, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8652320504188538, + "num_tokens": 626396315.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.8166720867156982, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8595516681671143, + "num_tokens": 626434189.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7223483324050903, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8624377250671387, + "num_tokens": 626469053.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.691986083984375, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8680145740509033, + "num_tokens": 626506265.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.4399433135986328, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8793756365776062, + "num_tokens": 626553175.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6871802806854248, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8686398863792419, + "num_tokens": 626589104.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6125880479812622, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8749990463256836, + "num_tokens": 626631288.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6451029777526855, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8875700235366821, + "num_tokens": 626668114.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.626959204673767, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8793765902519226, + "num_tokens": 626706324.0, + "step": 16429 + }, + { + "epoch": 2.090064877242081, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.688248872756958, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8773934841156006, + "num_tokens": 626740882.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.535697340965271, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.894133448600769, + "num_tokens": 626782993.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7095280885696411, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8681091666221619, + "num_tokens": 626820553.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.561505913734436, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.882096529006958, + "num_tokens": 626859853.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7769615650177002, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8478264808654785, + "num_tokens": 626900263.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.7619614601135254, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8714582920074463, + "num_tokens": 626933720.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6868842840194702, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8797881007194519, + "num_tokens": 626972066.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5335348844528198, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8770262598991394, + "num_tokens": 627015094.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5029160976409912, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8896172642707825, + "num_tokens": 627055105.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.619315505027771, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8817986249923706, + "num_tokens": 627092915.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.669622540473938, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8587032556533813, + "num_tokens": 627131345.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.5776045322418213, + "learning_rate": 1e-06, + "loss": 0.3188, + "mean_token_accuracy": 0.8950543999671936, + "num_tokens": 627168049.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 2.6106834411621094e-05, + "grad_norm": 1.6792118549346924, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8702641129493713, + "num_tokens": 627206499.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.5695116519927979, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.892951250076294, + "num_tokens": 627243221.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.5208351612091064, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8905264735221863, + "num_tokens": 627280960.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.720342993736267, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8533241152763367, + "num_tokens": 627321854.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.5772939920425415, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8791207075119019, + "num_tokens": 627358340.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.5822038650512695, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8778142929077148, + "num_tokens": 627397053.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.7917133569717407, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8664596080780029, + "num_tokens": 627433694.0, + "step": 16448 + }, + { + "epoch": 2.0924818725353007, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.716117262840271, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8626105785369873, + "num_tokens": 627473412.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.665238857269287, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8651343584060669, + "num_tokens": 627512324.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.6842409372329712, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8713799715042114, + "num_tokens": 627551478.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.6399260759353638, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8904705047607422, + "num_tokens": 627589775.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.7604016065597534, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8636724948883057, + "num_tokens": 627628382.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.6302058696746826, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8604003190994263, + "num_tokens": 627670244.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.6880121231079102, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8798689842224121, + "num_tokens": 627709293.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 2.369607925415039, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8795746564865112, + "num_tokens": 627745556.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7349385023117065, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8646937608718872, + "num_tokens": 627785965.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6493752002716064, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8860554695129395, + "num_tokens": 627823211.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6390535831451416, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8804497718811035, + "num_tokens": 627860368.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7184169292449951, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8735830783843994, + "num_tokens": 627897595.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.6121429204940796, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8839762210845947, + "num_tokens": 627935957.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.5512713193893433, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8683810234069824, + "num_tokens": 627978748.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.694622278213501, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8558200597763062, + "num_tokens": 628017658.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6237435340881348, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8730992674827576, + "num_tokens": 628058764.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 2.6226043701171875e-05, + "grad_norm": 1.690385341644287, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8804330229759216, + "num_tokens": 628093136.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.72905695438385, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8683667182922363, + "num_tokens": 628131559.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5673996210098267, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8876499533653259, + "num_tokens": 628171627.0, + "step": 16467 + }, + { + "epoch": 2.0948988678285207, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6372523307800293, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8731066584587097, + "num_tokens": 628210599.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6301136016845703, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.876894474029541, + "num_tokens": 628247709.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.64032781124115, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8749924898147583, + "num_tokens": 628285482.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8754934072494507, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8590585589408875, + "num_tokens": 628314250.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6707855463027954, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8669649362564087, + "num_tokens": 628354581.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6908984184265137, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8740876913070679, + "num_tokens": 628388434.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6003198623657227, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8690775036811829, + "num_tokens": 628429158.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.749193549156189, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8692830204963684, + "num_tokens": 628461749.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6562073230743408, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8822305202484131, + "num_tokens": 628501214.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7437084913253784, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8816443085670471, + "num_tokens": 628532870.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6535183191299438, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8893031477928162, + "num_tokens": 628571011.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8033568859100342, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8716052770614624, + "num_tokens": 628604290.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.580660343170166, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8808016180992126, + "num_tokens": 628649435.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.734608769416809, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8778148293495178, + "num_tokens": 628691239.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7657548189163208, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8707706928253174, + "num_tokens": 628725001.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6002800464630127, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8832235932350159, + "num_tokens": 628762955.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7057666778564453, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8913013935089111, + "num_tokens": 628798972.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6408828496932983, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8785796761512756, + "num_tokens": 628837099.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6099032163619995, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8814908266067505, + "num_tokens": 628877291.0, + "step": 16486 + }, + { + "epoch": 2.0973158631217403, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6336276531219482, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8850522041320801, + "num_tokens": 628916165.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5630801916122437, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8880081176757812, + "num_tokens": 628954701.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.597031831741333, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8834800124168396, + "num_tokens": 628992893.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5826061964035034, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8695908188819885, + "num_tokens": 629034818.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.4857970476150513, + "learning_rate": 1e-06, + "loss": 0.2987, + "mean_token_accuracy": 0.9020636677742004, + "num_tokens": 629072681.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8120023012161255, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8611037731170654, + "num_tokens": 629109598.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7398498058319092, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8837997317314148, + "num_tokens": 629148961.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5822386741638184, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8695254325866699, + "num_tokens": 629194405.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6461776494979858, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8826448321342468, + "num_tokens": 629230755.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.69473135471344, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8673500418663025, + "num_tokens": 629269145.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5545134544372559, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8747261762619019, + "num_tokens": 629311252.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5900219678878784, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8609887957572937, + "num_tokens": 629353590.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5649362802505493, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8798559308052063, + "num_tokens": 629393641.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6880098581314087, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8648335933685303, + "num_tokens": 629432438.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6224021911621094, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8807912468910217, + "num_tokens": 629475919.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.633784532546997, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8741111755371094, + "num_tokens": 629512261.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.657910943031311, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8559125661849976, + "num_tokens": 629551607.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5333855152130127, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8744596242904663, + "num_tokens": 629593006.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.791128158569336, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8712677955627441, + "num_tokens": 629631110.0, + "step": 16505 + }, + { + "epoch": 2.09973285841496, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5743076801300049, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.870885968208313, + "num_tokens": 629672627.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5604963302612305, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8781916499137878, + "num_tokens": 629715093.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5475945472717285, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8856385946273804, + "num_tokens": 629753789.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5714805126190186, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8686547875404358, + "num_tokens": 629797753.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5892404317855835, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8910977840423584, + "num_tokens": 629836147.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7397825717926025, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.876684844493866, + "num_tokens": 629869002.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6330095529556274, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8683958649635315, + "num_tokens": 629908653.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6396472454071045, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8791003227233887, + "num_tokens": 629944279.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7220849990844727, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8539652824401855, + "num_tokens": 629982396.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.4762916564941406, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8774300813674927, + "num_tokens": 630026687.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6236557960510254, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8753988742828369, + "num_tokens": 630064176.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6198052167892456, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8709571957588196, + "num_tokens": 630101179.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7018179893493652, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8798866271972656, + "num_tokens": 630134913.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6577036380767822, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8898199796676636, + "num_tokens": 630170026.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7137370109558105, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8781974911689758, + "num_tokens": 630204926.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6635340452194214, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8743600845336914, + "num_tokens": 630240910.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5851904153823853, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8774006366729736, + "num_tokens": 630278633.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7717217206954956, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8792010545730591, + "num_tokens": 630324421.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5357413291931152, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8813817501068115, + "num_tokens": 630367814.0, + "step": 16524 + }, + { + "epoch": 2.1021498537081795, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6798070669174194, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.867332935333252, + "num_tokens": 630407191.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8165831565856934, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8451706767082214, + "num_tokens": 630442997.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6972172260284424, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8816037178039551, + "num_tokens": 630479682.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.624537467956543, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8654111623764038, + "num_tokens": 630520047.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.623358964920044, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8709267377853394, + "num_tokens": 630557097.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5261626243591309, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.882072925567627, + "num_tokens": 630600024.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7603720426559448, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8668922185897827, + "num_tokens": 630635414.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5425491333007812, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8760552406311035, + "num_tokens": 630678765.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5351704359054565, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8888840079307556, + "num_tokens": 630718225.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.659896969795227, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.872488260269165, + "num_tokens": 630754677.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.708616852760315, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8852066993713379, + "num_tokens": 630785796.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5283275842666626, + "learning_rate": 1e-06, + "loss": 0.3094, + "mean_token_accuracy": 0.900432825088501, + "num_tokens": 630824375.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.667673110961914, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8688317537307739, + "num_tokens": 630862980.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6944446563720703, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8889561295509338, + "num_tokens": 630895935.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7660325765609741, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8643361926078796, + "num_tokens": 630938596.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.548704743385315, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8774912357330322, + "num_tokens": 630981716.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8740347623825073, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8878532648086548, + "num_tokens": 631011825.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6483170986175537, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8753740787506104, + "num_tokens": 631051342.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5967779159545898, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8854526877403259, + "num_tokens": 631089653.0, + "step": 16543 + }, + { + "epoch": 2.104566849001399, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8071855306625366, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8760897517204285, + "num_tokens": 631128064.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7147996425628662, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8797174692153931, + "num_tokens": 631168106.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6022453308105469, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8568576574325562, + "num_tokens": 631213732.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6541945934295654, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8525251150131226, + "num_tokens": 631256882.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5748207569122314, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8823069334030151, + "num_tokens": 631297373.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.803102731704712, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8886148929595947, + "num_tokens": 631331434.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6009116172790527, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8727673888206482, + "num_tokens": 631375596.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7534167766571045, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8805992603302002, + "num_tokens": 631409989.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6891125440597534, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8777115345001221, + "num_tokens": 631445396.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5529872179031372, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.880861222743988, + "num_tokens": 631487713.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8457984924316406, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8747477531433105, + "num_tokens": 631522198.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7639338970184326, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8661085367202759, + "num_tokens": 631563399.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.621206283569336, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8735694885253906, + "num_tokens": 631599886.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8238459825515747, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.875917911529541, + "num_tokens": 631635892.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7479239702224731, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8842579126358032, + "num_tokens": 631670324.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6469248533248901, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8826805353164673, + "num_tokens": 631705316.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6496281623840332, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.882750391960144, + "num_tokens": 631740845.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.93600332736969, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8726327419281006, + "num_tokens": 631770490.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.607124924659729, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8778760433197021, + "num_tokens": 631811806.0, + "step": 16562 + }, + { + "epoch": 2.106983844294619, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8057156801223755, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8476362228393555, + "num_tokens": 631849625.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8135902881622314, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8703416585922241, + "num_tokens": 631884302.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6985151767730713, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8810256123542786, + "num_tokens": 631922676.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6237190961837769, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8790923953056335, + "num_tokens": 631961257.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.9200950860977173, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8750470876693726, + "num_tokens": 631996218.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6004891395568848, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.877125084400177, + "num_tokens": 632033802.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5826953649520874, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8729265928268433, + "num_tokens": 632075666.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6901473999023438, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8739129304885864, + "num_tokens": 632114389.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7585216760635376, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.881178617477417, + "num_tokens": 632146182.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7175073623657227, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8792768716812134, + "num_tokens": 632187718.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6686468124389648, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8849529027938843, + "num_tokens": 632228697.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7623211145401, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8698185682296753, + "num_tokens": 632264622.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6968042850494385, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8741348385810852, + "num_tokens": 632302621.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5027806758880615, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8824632167816162, + "num_tokens": 632344022.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6750760078430176, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8683087229728699, + "num_tokens": 632385274.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7057157754898071, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8770815134048462, + "num_tokens": 632422716.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7323898077011108, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8551720976829529, + "num_tokens": 632461251.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7053302526474, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8723889589309692, + "num_tokens": 632497367.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6818864345550537, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8813002109527588, + "num_tokens": 632532301.0, + "step": 16581 + }, + { + "epoch": 2.1094008395878387, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7261497974395752, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8593466281890869, + "num_tokens": 632568698.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6650941371917725, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8845581412315369, + "num_tokens": 632606049.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5837680101394653, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8805065155029297, + "num_tokens": 632646974.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8866955041885376, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8675791025161743, + "num_tokens": 632678790.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 2.012888193130493, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8714999556541443, + "num_tokens": 632711189.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6107759475708008, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8743481636047363, + "num_tokens": 632752875.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8131529092788696, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8632827997207642, + "num_tokens": 632794189.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5175151824951172, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8861795663833618, + "num_tokens": 632833741.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.669967770576477, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8671284914016724, + "num_tokens": 632872752.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8512237071990967, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8738687634468079, + "num_tokens": 632905810.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7394533157348633, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.88068687915802, + "num_tokens": 632949074.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5616154670715332, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8804863691329956, + "num_tokens": 632989879.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.643958330154419, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8818710446357727, + "num_tokens": 633025309.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5881632566452026, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8836385011672974, + "num_tokens": 633063598.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.5031027793884277, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8962208032608032, + "num_tokens": 633104177.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6824169158935547, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8720244765281677, + "num_tokens": 633145532.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6921758651733398, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8769007921218872, + "num_tokens": 633183484.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6157363653182983, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8781472444534302, + "num_tokens": 633219917.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6888725757598877, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8800180554389954, + "num_tokens": 633251426.0, + "step": 16600 + }, + { + "epoch": 2.1118178348810583, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6662590503692627, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8813024163246155, + "num_tokens": 633288496.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7705498933792114, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8729353547096252, + "num_tokens": 633324537.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.8017395734786987, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8584616184234619, + "num_tokens": 633361923.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6553694009780884, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8808374404907227, + "num_tokens": 633400366.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6841514110565186, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.868212878704071, + "num_tokens": 633438879.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7089321613311768, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8772995471954346, + "num_tokens": 633476348.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7311537265777588, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8692837357521057, + "num_tokens": 633513558.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.640418291091919, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8685714602470398, + "num_tokens": 633559344.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6975479125976562, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8801889419555664, + "num_tokens": 633593798.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6177469491958618, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8858739137649536, + "num_tokens": 633628912.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.697731614112854, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8768558502197266, + "num_tokens": 633665925.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.7388781309127808, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8602854013442993, + "num_tokens": 633702177.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6934027671813965, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8785735964775085, + "num_tokens": 633737143.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.630246877670288, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8743562698364258, + "num_tokens": 633776366.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6794915199279785, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8848943710327148, + "num_tokens": 633809486.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7436835765838623, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8815248012542725, + "num_tokens": 633842133.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.698829174041748, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8753790855407715, + "num_tokens": 633879459.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5701779127120972, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8709269762039185, + "num_tokens": 633924382.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5520199537277222, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8755887746810913, + "num_tokens": 633964210.0, + "step": 16619 + }, + { + "epoch": 2.114234830174278, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7089736461639404, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8741135597229004, + "num_tokens": 634000388.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6280114650726318, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8755897283554077, + "num_tokens": 634038312.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5862352848052979, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8868349194526672, + "num_tokens": 634075429.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 2.6345252990722656e-05, + "grad_norm": 1.6706043481826782, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8743319511413574, + "num_tokens": 634112522.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5681713819503784, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8740575909614563, + "num_tokens": 634153352.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.639073133468628, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8796587586402893, + "num_tokens": 634188624.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5703041553497314, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8719489574432373, + "num_tokens": 634227990.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6150916814804077, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8798389434814453, + "num_tokens": 634264750.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5253106355667114, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8657526969909668, + "num_tokens": 634309428.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6030497550964355, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8856368660926819, + "num_tokens": 634347402.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.655185580253601, + "learning_rate": 1e-06, + "loss": 0.3278, + "mean_token_accuracy": 0.8928972482681274, + "num_tokens": 634379353.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5867602825164795, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8812680244445801, + "num_tokens": 634420682.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6384092569351196, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8701496720314026, + "num_tokens": 634461334.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.583910584449768, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8727589845657349, + "num_tokens": 634502615.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6573810577392578, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8708096742630005, + "num_tokens": 634542788.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5670188665390015, + "learning_rate": 1e-06, + "loss": 0.3104, + "mean_token_accuracy": 0.8992967009544373, + "num_tokens": 634582847.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6076550483703613, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8761544227600098, + "num_tokens": 634622783.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7304719686508179, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8796133995056152, + "num_tokens": 634660217.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.9358134269714355, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8779618144035339, + "num_tokens": 634692722.0, + "step": 16638 + }, + { + "epoch": 2.116651825467498, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6730289459228516, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.854933500289917, + "num_tokens": 634732524.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6503804922103882, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8611234426498413, + "num_tokens": 634774470.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6083221435546875, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8781142234802246, + "num_tokens": 634812562.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6451400518417358, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8926750421524048, + "num_tokens": 634846507.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6653040647506714, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8661038279533386, + "num_tokens": 634884180.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7598001956939697, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8630789518356323, + "num_tokens": 634922172.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7331079244613647, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8672740459442139, + "num_tokens": 634960310.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7039117813110352, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8684087991714478, + "num_tokens": 634996115.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6243290901184082, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8703563213348389, + "num_tokens": 635036892.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 2.313027858734131, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8702941536903381, + "num_tokens": 635076495.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5829246044158936, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8802597522735596, + "num_tokens": 635115213.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7733635902404785, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8781698942184448, + "num_tokens": 635146348.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8014131784439087, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8622201085090637, + "num_tokens": 635180742.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5521129369735718, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8840936422348022, + "num_tokens": 635220762.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6591272354125977, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8721232414245605, + "num_tokens": 635259048.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.667514443397522, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8817883729934692, + "num_tokens": 635293667.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.4721367359161377, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8810948133468628, + "num_tokens": 635339117.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.4771933555603027, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8836015462875366, + "num_tokens": 635379254.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.67781400680542, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8809232115745544, + "num_tokens": 635416595.0, + "step": 16657 + }, + { + "epoch": 2.1190688207607176, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.565110683441162, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8848392963409424, + "num_tokens": 635453722.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6581170558929443, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8700206279754639, + "num_tokens": 635492695.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5980277061462402, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8749275207519531, + "num_tokens": 635532152.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5688180923461914, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8610426783561707, + "num_tokens": 635575181.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.525553822517395, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8800414204597473, + "num_tokens": 635618042.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.669705867767334, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8809032440185547, + "num_tokens": 635653623.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6933976411819458, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.870525062084198, + "num_tokens": 635690962.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6145426034927368, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8752827644348145, + "num_tokens": 635728695.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7811368703842163, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8749171495437622, + "num_tokens": 635762148.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7293566465377808, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8719730377197266, + "num_tokens": 635797542.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 16.708223342895508, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8760329484939575, + "num_tokens": 635833736.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6736369132995605, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.868082582950592, + "num_tokens": 635873378.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7077734470367432, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8701070547103882, + "num_tokens": 635913422.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.789428949356079, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8681296706199646, + "num_tokens": 635946840.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.63949453830719, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8785969018936157, + "num_tokens": 635988814.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5904488563537598, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8814624547958374, + "num_tokens": 636031524.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7477742433547974, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.879060685634613, + "num_tokens": 636067107.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7438688278198242, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8732860088348389, + "num_tokens": 636103138.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5913608074188232, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.891222357749939, + "num_tokens": 636143191.0, + "step": 16676 + }, + { + "epoch": 2.121485816053937, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6280227899551392, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8792411684989929, + "num_tokens": 636181894.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.4887579679489136, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8694619536399841, + "num_tokens": 636227431.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5423128604888916, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8734872937202454, + "num_tokens": 636266227.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5569640398025513, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8903287649154663, + "num_tokens": 636305831.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6367727518081665, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8872382640838623, + "num_tokens": 636342982.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5787930488586426, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8838350176811218, + "num_tokens": 636384563.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7709808349609375, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8810652494430542, + "num_tokens": 636419837.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.665172815322876, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8784932494163513, + "num_tokens": 636455431.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.802870273590088, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8725754618644714, + "num_tokens": 636494345.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6550737619400024, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8685305118560791, + "num_tokens": 636536822.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6194474697113037, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8786766529083252, + "num_tokens": 636575982.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7195115089416504, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8502063155174255, + "num_tokens": 636616645.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6281518936157227, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8797029852867126, + "num_tokens": 636654698.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7102081775665283, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8803787231445312, + "num_tokens": 636690530.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.585017442703247, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8720540404319763, + "num_tokens": 636733466.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5683554410934448, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8788594007492065, + "num_tokens": 636772015.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7984325885772705, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8798388242721558, + "num_tokens": 636805579.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5158296823501587, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8847362995147705, + "num_tokens": 636848342.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7120325565338135, + "learning_rate": 1e-06, + "loss": 0.3252, + "mean_token_accuracy": 0.8921328783035278, + "num_tokens": 636879806.0, + "step": 16695 + }, + { + "epoch": 2.1239028113471567, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6600555181503296, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8672576546669006, + "num_tokens": 636919364.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6490627527236938, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8828904628753662, + "num_tokens": 636958320.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6588002443313599, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8778798580169678, + "num_tokens": 636995318.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.60606050491333, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8841779232025146, + "num_tokens": 637036852.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.586843490600586, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8828924298286438, + "num_tokens": 637079472.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5766557455062866, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8698253035545349, + "num_tokens": 637124372.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8091212511062622, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8819746971130371, + "num_tokens": 637159979.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6445893049240112, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.8885776400566101, + "num_tokens": 637200861.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5040521621704102, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8761477470397949, + "num_tokens": 637245386.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.655849814414978, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8668420910835266, + "num_tokens": 637286544.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.741109848022461, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8509056568145752, + "num_tokens": 637323650.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5811550617218018, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8805660009384155, + "num_tokens": 637361082.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6121782064437866, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.892598032951355, + "num_tokens": 637399341.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6418843269348145, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8834545612335205, + "num_tokens": 637437623.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6027498245239258, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8768309950828552, + "num_tokens": 637478128.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5545201301574707, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8827839493751526, + "num_tokens": 637517167.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.644408941268921, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8793925046920776, + "num_tokens": 637556051.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5850695371627808, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8748974800109863, + "num_tokens": 637596990.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6773676872253418, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8757362365722656, + "num_tokens": 637638825.0, + "step": 16714 + }, + { + "epoch": 2.1263198066403763, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.548170566558838, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8903731107711792, + "num_tokens": 637679559.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8135401010513306, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.872636079788208, + "num_tokens": 637715825.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.549837350845337, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8805916905403137, + "num_tokens": 637758413.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8378753662109375, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8807406425476074, + "num_tokens": 637789703.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7620853185653687, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8718063235282898, + "num_tokens": 637823166.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6508305072784424, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8723800182342529, + "num_tokens": 637861994.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6672780513763428, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8810210824012756, + "num_tokens": 637898852.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.729933500289917, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8779945373535156, + "num_tokens": 637935067.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8233288526535034, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8732254505157471, + "num_tokens": 637969019.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5814387798309326, + "learning_rate": 1e-06, + "loss": 0.3137, + "mean_token_accuracy": 0.8974728584289551, + "num_tokens": 638005466.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6018776893615723, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8889070749282837, + "num_tokens": 638044383.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8797422647476196, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8722324967384338, + "num_tokens": 638078259.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.719785213470459, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8821958899497986, + "num_tokens": 638112079.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5565062761306763, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8847018480300903, + "num_tokens": 638153023.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7830053567886353, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8812123537063599, + "num_tokens": 638187685.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6816370487213135, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8703434467315674, + "num_tokens": 638225629.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7482526302337646, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8705519437789917, + "num_tokens": 638264325.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.751709222793579, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8518633842468262, + "num_tokens": 638304009.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6590057611465454, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8828796148300171, + "num_tokens": 638343359.0, + "step": 16733 + }, + { + "epoch": 2.1287368019335964, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6745026111602783, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8662177920341492, + "num_tokens": 638379537.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.670637845993042, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8574910163879395, + "num_tokens": 638419324.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.636669397354126, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8758857846260071, + "num_tokens": 638455567.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7116869688034058, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8796125054359436, + "num_tokens": 638490410.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8407992124557495, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8748319149017334, + "num_tokens": 638530597.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5785905122756958, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8778151273727417, + "num_tokens": 638574040.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5933985710144043, + "learning_rate": 1e-06, + "loss": 0.3315, + "mean_token_accuracy": 0.889266848564148, + "num_tokens": 638612227.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7182979583740234, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8737995624542236, + "num_tokens": 638649885.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.59839928150177, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8870736360549927, + "num_tokens": 638686928.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5988171100616455, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8748074769973755, + "num_tokens": 638726327.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5871093273162842, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8849717378616333, + "num_tokens": 638767662.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.675832748413086, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8674919605255127, + "num_tokens": 638808981.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6737743616104126, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8597809076309204, + "num_tokens": 638847708.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7021510601043701, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8843020796775818, + "num_tokens": 638884417.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.665523886680603, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8826302289962769, + "num_tokens": 638920075.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.5329152345657349, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.887846827507019, + "num_tokens": 638959303.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6084645986557007, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8740029335021973, + "num_tokens": 639000093.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6725443601608276, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8734476566314697, + "num_tokens": 639037799.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7703360319137573, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8732749223709106, + "num_tokens": 639072019.0, + "step": 16752 + }, + { + "epoch": 2.131153797226816, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6360883712768555, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8736732006072998, + "num_tokens": 639111067.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.622005820274353, + "learning_rate": 1e-06, + "loss": 0.3316, + "mean_token_accuracy": 0.8912348747253418, + "num_tokens": 639149363.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.798265814781189, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8819555640220642, + "num_tokens": 639184259.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.6040476560592651, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8774007558822632, + "num_tokens": 639226589.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.7314118146896362, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8686985969543457, + "num_tokens": 639264539.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6737140417099, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8758203387260437, + "num_tokens": 639299172.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.550430417060852, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8649278283119202, + "num_tokens": 639340401.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7426310777664185, + "learning_rate": 1e-06, + "loss": 0.3338, + "mean_token_accuracy": 0.8912209272384644, + "num_tokens": 639384348.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8167755603790283, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8600609302520752, + "num_tokens": 639423198.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 2.6464462280273438e-05, + "grad_norm": 1.8588391542434692, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8742668628692627, + "num_tokens": 639454424.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7366303205490112, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8685293197631836, + "num_tokens": 639492487.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8297154903411865, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8755663633346558, + "num_tokens": 639526019.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7818357944488525, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8812137246131897, + "num_tokens": 639561027.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6710240840911865, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8858945965766907, + "num_tokens": 639598174.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6806542873382568, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8801243305206299, + "num_tokens": 639638785.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6739002466201782, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8911458849906921, + "num_tokens": 639676291.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5873956680297852, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.88593989610672, + "num_tokens": 639713911.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.4754503965377808, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8772090673446655, + "num_tokens": 639757974.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6001754999160767, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8825187683105469, + "num_tokens": 639796217.0, + "step": 16771 + }, + { + "epoch": 2.1335707925200356, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6759223937988281, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8766330480575562, + "num_tokens": 639831458.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6163934469223022, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8847354650497437, + "num_tokens": 639867199.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6265770196914673, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.856776773929596, + "num_tokens": 639913418.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6771442890167236, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8633089065551758, + "num_tokens": 639956567.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6093193292617798, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8573611378669739, + "num_tokens": 640000741.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6315670013427734, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8643568158149719, + "num_tokens": 640040108.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.693234920501709, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8640899658203125, + "num_tokens": 640080717.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6709696054458618, + "learning_rate": 1e-06, + "loss": 0.3129, + "mean_token_accuracy": 0.8985536098480225, + "num_tokens": 640114013.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7074519395828247, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8722366094589233, + "num_tokens": 640151050.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.614312767982483, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8690563440322876, + "num_tokens": 640191004.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6784000396728516, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8708168268203735, + "num_tokens": 640228748.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8027080297470093, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8694595694541931, + "num_tokens": 640264331.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5819774866104126, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8726062774658203, + "num_tokens": 640306149.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.590065360069275, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8688499927520752, + "num_tokens": 640349004.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7276148796081543, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8626091480255127, + "num_tokens": 640385630.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7758229970932007, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8784722685813904, + "num_tokens": 640417622.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5415912866592407, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8662372827529907, + "num_tokens": 640460557.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5314064025878906, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8667295575141907, + "num_tokens": 640501744.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6400816440582275, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8893687725067139, + "num_tokens": 640541119.0, + "step": 16790 + }, + { + "epoch": 2.135987787813255, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6799794435501099, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8873105049133301, + "num_tokens": 640579206.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5115559101104736, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8755718469619751, + "num_tokens": 640625919.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.660914659500122, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8776398301124573, + "num_tokens": 640665926.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.642888069152832, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8750581741333008, + "num_tokens": 640706637.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.756386160850525, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8766039609909058, + "num_tokens": 640746523.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5263079404830933, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8727642297744751, + "num_tokens": 640792166.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.714287281036377, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8768883943557739, + "num_tokens": 640827633.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.683329463005066, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8725589513778687, + "num_tokens": 640865727.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7979031801223755, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8671296238899231, + "num_tokens": 640900678.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5901985168457031, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.877860426902771, + "num_tokens": 640944540.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6149523258209229, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8976091146469116, + "num_tokens": 640985316.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6801646947860718, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8850620985031128, + "num_tokens": 641027706.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7447768449783325, + "learning_rate": 1e-06, + "loss": 0.3277, + "mean_token_accuracy": 0.8938857316970825, + "num_tokens": 641061902.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.701368808746338, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.87911456823349, + "num_tokens": 641103497.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8960886001586914, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8620877861976624, + "num_tokens": 641136809.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6561981439590454, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8799576163291931, + "num_tokens": 641175724.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7534794807434082, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8701207637786865, + "num_tokens": 641210098.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6993588209152222, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8793420791625977, + "num_tokens": 641243889.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.754073143005371, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8825993537902832, + "num_tokens": 641279838.0, + "step": 16809 + }, + { + "epoch": 2.138404783106475, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7193248271942139, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8851761817932129, + "num_tokens": 641319375.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7544565200805664, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8758920431137085, + "num_tokens": 641353785.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7328559160232544, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8732016086578369, + "num_tokens": 641388081.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6565582752227783, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8624762296676636, + "num_tokens": 641428363.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.800331473350525, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8702134490013123, + "num_tokens": 641460545.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7178295850753784, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8651232719421387, + "num_tokens": 641499651.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5922982692718506, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.890121340751648, + "num_tokens": 641539862.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7955164909362793, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8779586553573608, + "num_tokens": 641575524.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5345842838287354, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8711001873016357, + "num_tokens": 641621340.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7365598678588867, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8705579042434692, + "num_tokens": 641657233.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5911260843276978, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8920592069625854, + "num_tokens": 641693740.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8188841342926025, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8750019073486328, + "num_tokens": 641731798.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8019343614578247, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8846129775047302, + "num_tokens": 641762911.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6659713983535767, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8701016306877136, + "num_tokens": 641802387.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.639122724533081, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.849112331867218, + "num_tokens": 641846133.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7703962326049805, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8676286339759827, + "num_tokens": 641881730.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6002063751220703, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8782778382301331, + "num_tokens": 641922261.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.64609956741333, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8786333799362183, + "num_tokens": 641961297.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6140480041503906, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8826254606246948, + "num_tokens": 641999950.0, + "step": 16828 + }, + { + "epoch": 2.140821778399695, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5550127029418945, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8864204287528992, + "num_tokens": 642041255.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6747041940689087, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8774558305740356, + "num_tokens": 642078730.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6426767110824585, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8864128589630127, + "num_tokens": 642116062.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8383721113204956, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8543936014175415, + "num_tokens": 642156642.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7212309837341309, + "learning_rate": 1e-06, + "loss": 0.3227, + "mean_token_accuracy": 0.8955367803573608, + "num_tokens": 642187895.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6052037477493286, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8868683576583862, + "num_tokens": 642224162.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.63936448097229, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8708653450012207, + "num_tokens": 642266474.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.648381233215332, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8793829083442688, + "num_tokens": 642301416.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.8161444664001465, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8765823841094971, + "num_tokens": 642336235.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5742969512939453, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8849738836288452, + "num_tokens": 642377283.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6595661640167236, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8612795472145081, + "num_tokens": 642418336.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.733917236328125, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8651825189590454, + "num_tokens": 642454480.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5970849990844727, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8827202916145325, + "num_tokens": 642493677.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6829475164413452, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8670063018798828, + "num_tokens": 642535570.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6932345628738403, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8660863637924194, + "num_tokens": 642574941.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7302510738372803, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8670515418052673, + "num_tokens": 642610069.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6659942865371704, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8799476027488708, + "num_tokens": 642649156.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.713903784751892, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8813862800598145, + "num_tokens": 642684643.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6452921628952026, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.879632830619812, + "num_tokens": 642726955.0, + "step": 16847 + }, + { + "epoch": 2.1432387736929144, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6302199363708496, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8894288539886475, + "num_tokens": 642763573.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6797144412994385, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8824691772460938, + "num_tokens": 642798909.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7285358905792236, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8730921149253845, + "num_tokens": 642831835.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6922112703323364, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8861767053604126, + "num_tokens": 642868207.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.751786470413208, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8601665496826172, + "num_tokens": 642907689.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.540130853652954, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8742062449455261, + "num_tokens": 642952275.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.861316442489624, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8784987926483154, + "num_tokens": 642983942.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.4987176656723022, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8881783485412598, + "num_tokens": 643024912.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6029692888259888, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8883959650993347, + "num_tokens": 643059698.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6359511613845825, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8778904676437378, + "num_tokens": 643096068.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6884963512420654, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8716075420379639, + "num_tokens": 643132695.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7287864685058594, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8520210981369019, + "num_tokens": 643171464.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5694550275802612, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8830910921096802, + "num_tokens": 643213740.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6272190809249878, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8726651668548584, + "num_tokens": 643250854.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6071237325668335, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8838666677474976, + "num_tokens": 643290215.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5194381475448608, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8825778961181641, + "num_tokens": 643330209.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6249020099639893, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8690616488456726, + "num_tokens": 643369432.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.595834493637085, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8756463527679443, + "num_tokens": 643410468.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.636247992515564, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8768000602722168, + "num_tokens": 643449450.0, + "step": 16866 + }, + { + "epoch": 2.145655768986134, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7986736297607422, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8698029518127441, + "num_tokens": 643485897.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7964807748794556, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8772228956222534, + "num_tokens": 643521284.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.7206209897994995, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8623563647270203, + "num_tokens": 643558270.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.740767240524292, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8607487082481384, + "num_tokens": 643596225.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5341598987579346, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8837825655937195, + "num_tokens": 643639394.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.826737403869629, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8659254312515259, + "num_tokens": 643671745.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6446192264556885, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8667605519294739, + "num_tokens": 643710085.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6512876749038696, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8770759105682373, + "num_tokens": 643747866.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6200666427612305, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8639422655105591, + "num_tokens": 643787349.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.582659125328064, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8857983350753784, + "num_tokens": 643823136.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6501250267028809, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8832119703292847, + "num_tokens": 643858397.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.5984612703323364, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8653571605682373, + "num_tokens": 643899917.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.596071720123291, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8712803721427917, + "num_tokens": 643936898.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.657984733581543, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.880479097366333, + "num_tokens": 643978360.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6758557558059692, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8716830611228943, + "num_tokens": 644012590.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.631423830986023, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8601746559143066, + "num_tokens": 644054581.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6669089794158936, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8869763612747192, + "num_tokens": 644089744.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.538714051246643, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8874125480651855, + "num_tokens": 644130520.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.784234881401062, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8692131638526917, + "num_tokens": 644165440.0, + "step": 16885 + }, + { + "epoch": 2.1480727642793536, + "ewc_loss": 2.658367156982422e-05, + "grad_norm": 1.6511238813400269, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8693286180496216, + "num_tokens": 644203620.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7823153734207153, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8741014003753662, + "num_tokens": 644237466.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.698157548904419, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8707057237625122, + "num_tokens": 644278790.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6596020460128784, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8806371688842773, + "num_tokens": 644316504.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6080539226531982, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8816600441932678, + "num_tokens": 644357513.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.777899146080017, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8826451301574707, + "num_tokens": 644391578.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.60818350315094, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8891631364822388, + "num_tokens": 644429015.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.568874478340149, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8805426359176636, + "num_tokens": 644468688.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6608812808990479, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8593884110450745, + "num_tokens": 644510468.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.398352861404419, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8901345133781433, + "num_tokens": 644558085.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.851816177368164, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8588155508041382, + "num_tokens": 644594409.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5813449621200562, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.874893307685852, + "num_tokens": 644635329.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.557403326034546, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8752740621566772, + "num_tokens": 644677288.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5925456285476685, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8697336316108704, + "num_tokens": 644721824.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6133372783660889, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.862573504447937, + "num_tokens": 644763633.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6010181903839111, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8744905591011047, + "num_tokens": 644803894.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7701092958450317, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8714396357536316, + "num_tokens": 644843449.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7040051221847534, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8790995478630066, + "num_tokens": 644878047.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.687354564666748, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8694539070129395, + "num_tokens": 644915981.0, + "step": 16904 + }, + { + "epoch": 2.1504897595725736, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7458274364471436, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8758611083030701, + "num_tokens": 644948890.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7016913890838623, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.880170464515686, + "num_tokens": 644983396.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7617512941360474, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8803731203079224, + "num_tokens": 645016125.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.716821551322937, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8776962161064148, + "num_tokens": 645050622.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8356860876083374, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8714647889137268, + "num_tokens": 645081577.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6185834407806396, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8819820880889893, + "num_tokens": 645118896.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5801047086715698, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.879280686378479, + "num_tokens": 645160613.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.642188549041748, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.885239839553833, + "num_tokens": 645202111.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6408367156982422, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8797374367713928, + "num_tokens": 645242840.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7160420417785645, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8796555399894714, + "num_tokens": 645282917.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6582021713256836, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8760393857955933, + "num_tokens": 645321722.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.610270380973816, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8803409337997437, + "num_tokens": 645360366.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5622732639312744, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8775002360343933, + "num_tokens": 645401248.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6432805061340332, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8668127059936523, + "num_tokens": 645441750.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6965018510818481, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8512369990348816, + "num_tokens": 645481044.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.9959006309509277, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8681780099868774, + "num_tokens": 645511358.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.9073147773742676, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8598044514656067, + "num_tokens": 645543801.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6770213842391968, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8671432137489319, + "num_tokens": 645580796.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6496891975402832, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.885106086730957, + "num_tokens": 645616933.0, + "step": 16923 + }, + { + "epoch": 2.152906754865793, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.780293583869934, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8593028783798218, + "num_tokens": 645651932.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5736289024353027, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8716408014297485, + "num_tokens": 645692437.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6119483709335327, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8761965036392212, + "num_tokens": 645731852.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7700425386428833, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8741081953048706, + "num_tokens": 645765207.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6629225015640259, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8802331686019897, + "num_tokens": 645805236.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6222331523895264, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8677220344543457, + "num_tokens": 645847128.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8182320594787598, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8670989274978638, + "num_tokens": 645884067.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5835570096969604, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8823697566986084, + "num_tokens": 645925533.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5558418035507202, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8847326040267944, + "num_tokens": 645967318.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7626676559448242, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8721264004707336, + "num_tokens": 646000460.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6076810359954834, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8667904138565063, + "num_tokens": 646043689.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6580208539962769, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8685874938964844, + "num_tokens": 646083237.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8942655324935913, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8529866933822632, + "num_tokens": 646115576.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5748579502105713, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8705929517745972, + "num_tokens": 646157958.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6191222667694092, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8685956001281738, + "num_tokens": 646200171.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6023370027542114, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8748699426651001, + "num_tokens": 646243880.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.740699052810669, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8722133040428162, + "num_tokens": 646279339.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.633294701576233, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8759213089942932, + "num_tokens": 646316773.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7340764999389648, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8784512877464294, + "num_tokens": 646356468.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6732606887817383, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8769261837005615, + "num_tokens": 646394858.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5402065515518188, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8840107917785645, + "num_tokens": 646433513.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.57674241065979, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8829487562179565, + "num_tokens": 646474645.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.9461743831634521, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8823848962783813, + "num_tokens": 646505953.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5941250324249268, + "learning_rate": 1e-06, + "loss": 0.344, + "mean_token_accuracy": 0.8839595913887024, + "num_tokens": 646544187.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7322231531143188, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8836753368377686, + "num_tokens": 646576774.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8021479845046997, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8711161017417908, + "num_tokens": 646610525.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7261576652526855, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8721567392349243, + "num_tokens": 646644677.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6565054655075073, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8826473951339722, + "num_tokens": 646683525.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.804796576499939, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8771003484725952, + "num_tokens": 646717656.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7370785474777222, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8798855543136597, + "num_tokens": 646754223.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8895353078842163, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8833714723587036, + "num_tokens": 646787523.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5150140523910522, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8845729827880859, + "num_tokens": 646829525.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7494782209396362, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8825690746307373, + "num_tokens": 646864831.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6717673540115356, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8748082518577576, + "num_tokens": 646903983.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7105964422225952, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8823660612106323, + "num_tokens": 646941366.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6302647590637207, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8756687641143799, + "num_tokens": 646983976.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 2.0816845893859863, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8618208765983582, + "num_tokens": 647012858.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.5571714639663696, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8691290616989136, + "num_tokens": 647052953.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.510890007019043, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8860223293304443, + "num_tokens": 647098206.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6941044330596924, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8873720765113831, + "num_tokens": 647133242.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6181398630142212, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8771753907203674, + "num_tokens": 647173741.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7790621519088745, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8742775917053223, + "num_tokens": 647208210.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6035360097885132, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8785984516143799, + "num_tokens": 647250903.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.611307978630066, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8676434755325317, + "num_tokens": 647288425.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.518784761428833, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.865849494934082, + "num_tokens": 647332550.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7395669221878052, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8816791772842407, + "num_tokens": 647367449.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.7717448472976685, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8727743625640869, + "num_tokens": 647406049.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6403908729553223, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8749215602874756, + "num_tokens": 647446011.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.8110285997390747, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8784385919570923, + "num_tokens": 647480260.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6831625699996948, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8655477166175842, + "num_tokens": 647521318.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6705560684204102, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.86506587266922, + "num_tokens": 647560417.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.4316617250442505, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8799548745155334, + "num_tokens": 647607083.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 2.0023064613342285, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8734555244445801, + "num_tokens": 647636906.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.523148775100708, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8854799270629883, + "num_tokens": 647677435.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.6777160167694092, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8696462512016296, + "num_tokens": 647715539.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.9258509874343872, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8575098514556885, + "num_tokens": 647749070.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5848352909088135, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8856386542320251, + "num_tokens": 647788323.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 2.6702880859375e-05, + "grad_norm": 1.817725419998169, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8826724290847778, + "num_tokens": 647819336.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6599762439727783, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.876376748085022, + "num_tokens": 647857801.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.8496843576431274, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8954440951347351, + "num_tokens": 647892055.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.724726915359497, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8659485578536987, + "num_tokens": 647931417.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7348240613937378, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8816366791725159, + "num_tokens": 647964677.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.747772216796875, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.876829981803894, + "num_tokens": 647999338.0, + "step": 16986 + }, + { + "epoch": 2.160921002416995, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6694762706756592, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8628329038619995, + "num_tokens": 648040944.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7846599817276, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8705781102180481, + "num_tokens": 648078983.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5470231771469116, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8602254390716553, + "num_tokens": 648123165.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7075005769729614, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8960270881652832, + "num_tokens": 648157466.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5297796726226807, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8805079460144043, + "num_tokens": 648200899.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.655458927154541, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8686830997467041, + "num_tokens": 648240267.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6636319160461426, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8815021514892578, + "num_tokens": 648275837.0, + "step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7672218084335327, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.870692253112793, + "num_tokens": 648311170.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7804416418075562, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8702467679977417, + "num_tokens": 648350045.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6431230306625366, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8520803451538086, + "num_tokens": 648391297.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5368046760559082, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8732357025146484, + "num_tokens": 648434167.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.8843331336975098, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.868118166923523, + "num_tokens": 648467693.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5510103702545166, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8794133067131042, + "num_tokens": 648508538.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.670960545539856, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8854475021362305, + "num_tokens": 648545024.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7217808961868286, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8779005408287048, + "num_tokens": 648584865.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.512359857559204, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8849932551383972, + "num_tokens": 648625832.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7523001432418823, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8748819231987, + "num_tokens": 648661182.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.9098776578903198, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8750824928283691, + "num_tokens": 648694748.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.689836025238037, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.867707371711731, + "num_tokens": 648733219.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.584133505821228, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8711592555046082, + "num_tokens": 648774955.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5663015842437744, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8839826583862305, + "num_tokens": 648812363.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7383641004562378, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.883083164691925, + "num_tokens": 648844203.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6336994171142578, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8708411455154419, + "num_tokens": 648885415.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.546807050704956, + "learning_rate": 1e-06, + "loss": 0.3371, + "mean_token_accuracy": 0.8888557553291321, + "num_tokens": 648925028.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.448302984237671, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.868789553642273, + "num_tokens": 648968507.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5469640493392944, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8785358667373657, + "num_tokens": 649006787.0, + "step": 17012 + }, + { + "epoch": 2.1642284696603484, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.628633737564087, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.881106436252594, + "num_tokens": 649042303.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5516661405563354, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8785154223442078, + "num_tokens": 649081038.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7523515224456787, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8659424781799316, + "num_tokens": 649121450.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.610090970993042, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.873905599117279, + "num_tokens": 649159080.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.757168173789978, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8871906399726868, + "num_tokens": 649190084.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.9411851167678833, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8721569180488586, + "num_tokens": 649219087.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6930711269378662, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.88343346118927, + "num_tokens": 649254639.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7656573057174683, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8783642053604126, + "num_tokens": 649290140.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7192155122756958, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8803274631500244, + "num_tokens": 649323938.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5448849201202393, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8849494457244873, + "num_tokens": 649367579.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6545501947402954, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.875640869140625, + "num_tokens": 649404890.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.553006649017334, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8750827312469482, + "num_tokens": 649447221.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7294038534164429, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8719119429588318, + "num_tokens": 649487465.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7016916275024414, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8863627910614014, + "num_tokens": 649521489.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.645478367805481, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8753662705421448, + "num_tokens": 649558363.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.609863042831421, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8752918243408203, + "num_tokens": 649597285.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6322296857833862, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8815447688102722, + "num_tokens": 649634933.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7522650957107544, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8804060220718384, + "num_tokens": 649672513.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7911375761032104, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8714032173156738, + "num_tokens": 649705505.0, + "step": 17031 + }, + { + "epoch": 2.166645464953568, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6951007843017578, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8805118799209595, + "num_tokens": 649745020.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6332775354385376, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8730040788650513, + "num_tokens": 649789142.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 2.021240234375, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8659461140632629, + "num_tokens": 649820917.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5282005071640015, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8834783434867859, + "num_tokens": 649862745.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7376857995986938, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8624324798583984, + "num_tokens": 649899562.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7869449853897095, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8702080249786377, + "num_tokens": 649935023.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.609020471572876, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8645628690719604, + "num_tokens": 649971986.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.63985013961792, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8573439121246338, + "num_tokens": 650016623.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5980439186096191, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8872865438461304, + "num_tokens": 650060768.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5562901496887207, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8798441886901855, + "num_tokens": 650100339.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7885773181915283, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8725957274436951, + "num_tokens": 650133694.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6245464086532593, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8851358294487, + "num_tokens": 650170405.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.512255311012268, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8772686719894409, + "num_tokens": 650210366.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6832046508789062, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8755447864532471, + "num_tokens": 650253170.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6227669715881348, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8753624558448792, + "num_tokens": 650294940.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.8207625150680542, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8666210174560547, + "num_tokens": 650327317.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7306736707687378, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8786575794219971, + "num_tokens": 650359235.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6814619302749634, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8665292263031006, + "num_tokens": 650400157.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7414014339447021, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8815957307815552, + "num_tokens": 650438514.0, + "step": 17050 + }, + { + "epoch": 2.169062460246788, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7235815525054932, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8747296333312988, + "num_tokens": 650476531.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.8457001447677612, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8506702184677124, + "num_tokens": 650514530.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.9505152702331543, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8863576650619507, + "num_tokens": 650549698.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6250646114349365, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8884082436561584, + "num_tokens": 650587263.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5333051681518555, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8784870505332947, + "num_tokens": 650630000.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5889009237289429, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8689782023429871, + "num_tokens": 650672865.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7215806245803833, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8792872428894043, + "num_tokens": 650711928.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.7426791191101074, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8726378083229065, + "num_tokens": 650750169.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6067134141921997, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8724822998046875, + "num_tokens": 650792400.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.6444597244262695, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8807299137115479, + "num_tokens": 650831320.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7600171566009521, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8778058886528015, + "num_tokens": 650862864.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 2.682209014892578e-05, + "grad_norm": 1.5394002199172974, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8644238114356995, + "num_tokens": 650907542.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6332818269729614, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8573076128959656, + "num_tokens": 650951097.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6277093887329102, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8795552849769592, + "num_tokens": 650987702.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7307007312774658, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8758557438850403, + "num_tokens": 651024668.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6431376934051514, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8795351982116699, + "num_tokens": 651061687.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7616450786590576, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8859424591064453, + "num_tokens": 651100094.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5198802947998047, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8767383098602295, + "num_tokens": 651142935.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5699635744094849, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8735811710357666, + "num_tokens": 651184280.0, + "step": 17069 + }, + { + "epoch": 2.1714794555400077, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6534119844436646, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8895804286003113, + "num_tokens": 651220808.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8786169290542603, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8599061965942383, + "num_tokens": 651255666.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.557780385017395, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.89312744140625, + "num_tokens": 651290912.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5742120742797852, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8751306533813477, + "num_tokens": 651330778.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.9179750680923462, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8745511174201965, + "num_tokens": 651361329.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6970492601394653, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8869168758392334, + "num_tokens": 651393546.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7637122869491577, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8687564730644226, + "num_tokens": 651429976.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6940293312072754, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8798611164093018, + "num_tokens": 651465460.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7340008020401, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8799393773078918, + "num_tokens": 651500091.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5312750339508057, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8834325671195984, + "num_tokens": 651541785.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6316677331924438, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8811508417129517, + "num_tokens": 651578948.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.714149832725525, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8772506713867188, + "num_tokens": 651614435.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7002912759780884, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8754518628120422, + "num_tokens": 651654107.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6882808208465576, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8621358871459961, + "num_tokens": 651693259.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.632419466972351, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8771377801895142, + "num_tokens": 651734535.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6599832773208618, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8606289029121399, + "num_tokens": 651775539.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6551687717437744, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8667573928833008, + "num_tokens": 651810869.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7304702997207642, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8731226921081543, + "num_tokens": 651846285.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.4914599657058716, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8902246952056885, + "num_tokens": 651888825.0, + "step": 17088 + }, + { + "epoch": 2.1738964508332272, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7166824340820312, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8653037548065186, + "num_tokens": 651927874.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5554227828979492, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8740721940994263, + "num_tokens": 651969294.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5203205347061157, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8736315965652466, + "num_tokens": 652015123.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7010111808776855, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8868680000305176, + "num_tokens": 652050261.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.716729760169983, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8655648231506348, + "num_tokens": 652088228.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5717231035232544, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8700015544891357, + "num_tokens": 652128273.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5819530487060547, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.868066668510437, + "num_tokens": 652169022.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6618319749832153, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8576873540878296, + "num_tokens": 652206735.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5847599506378174, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8681192994117737, + "num_tokens": 652247060.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6512917280197144, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8623457551002502, + "num_tokens": 652287987.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6065452098846436, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8779584765434265, + "num_tokens": 652325947.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6079415082931519, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8749017119407654, + "num_tokens": 652366705.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8438421487808228, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8708304762840271, + "num_tokens": 652401139.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7114205360412598, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8725377321243286, + "num_tokens": 652438273.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6290092468261719, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8743572235107422, + "num_tokens": 652480146.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.688593864440918, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8742331266403198, + "num_tokens": 652518871.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6588141918182373, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8844356536865234, + "num_tokens": 652559602.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.744884729385376, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8920778632164001, + "num_tokens": 652595115.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.647220492362976, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8592607975006104, + "num_tokens": 652633926.0, + "step": 17107 + }, + { + "epoch": 2.176313446126447, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.9437252283096313, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8554521799087524, + "num_tokens": 652667092.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5603210926055908, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8595377802848816, + "num_tokens": 652712002.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7683640718460083, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8791772127151489, + "num_tokens": 652753096.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7773489952087402, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8909081816673279, + "num_tokens": 652786454.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.736556887626648, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8846898078918457, + "num_tokens": 652823482.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.677824854850769, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8927670121192932, + "num_tokens": 652856571.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8622946739196777, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8618494272232056, + "num_tokens": 652890819.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5991626977920532, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8786267042160034, + "num_tokens": 652929430.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.647027611732483, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8700249195098877, + "num_tokens": 652970747.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7120381593704224, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8814787864685059, + "num_tokens": 653004711.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7262533903121948, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8875616192817688, + "num_tokens": 653041748.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6632269620895386, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8798227310180664, + "num_tokens": 653080523.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.748656988143921, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8784581422805786, + "num_tokens": 653113378.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.816819429397583, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8805280923843384, + "num_tokens": 653148335.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5889447927474976, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8797537684440613, + "num_tokens": 653193646.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6814533472061157, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.861204981803894, + "num_tokens": 653232572.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.695874571800232, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.868617594242096, + "num_tokens": 653271779.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7264546155929565, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8732054233551025, + "num_tokens": 653312939.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.617935061454773, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8830810785293579, + "num_tokens": 653352567.0, + "step": 17126 + }, + { + "epoch": 2.178730441419667, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7321009635925293, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8734256029129028, + "num_tokens": 653390823.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6521058082580566, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8774886131286621, + "num_tokens": 653430220.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5260826349258423, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8714480400085449, + "num_tokens": 653475067.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7329986095428467, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.868943452835083, + "num_tokens": 653512563.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7109397649765015, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.88591468334198, + "num_tokens": 653546539.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8486405611038208, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8831602334976196, + "num_tokens": 653586459.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6098133325576782, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8782762289047241, + "num_tokens": 653623879.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8050776720046997, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8692363500595093, + "num_tokens": 653661828.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.4969274997711182, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8931432366371155, + "num_tokens": 653702019.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.581799864768982, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8814272284507751, + "num_tokens": 653744073.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8793251514434814, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8709274530410767, + "num_tokens": 653780964.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5785496234893799, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8835382461547852, + "num_tokens": 653819040.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5636111497879028, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8794980645179749, + "num_tokens": 653857630.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5795351266860962, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8771971464157104, + "num_tokens": 653896236.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.588369607925415, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8879907131195068, + "num_tokens": 653933116.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.730278491973877, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8651899099349976, + "num_tokens": 653971110.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.640838384628296, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8761770725250244, + "num_tokens": 654010551.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5615723133087158, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8806910514831543, + "num_tokens": 654049889.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6429803371429443, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8607281446456909, + "num_tokens": 654087782.0, + "step": 17145 + }, + { + "epoch": 2.1811474367128865, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6104977130889893, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8716138005256653, + "num_tokens": 654128520.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6556541919708252, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8748133182525635, + "num_tokens": 654166739.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.3292200565338135, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8913533091545105, + "num_tokens": 654212682.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7647206783294678, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.866412341594696, + "num_tokens": 654248648.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.711252212524414, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8808256387710571, + "num_tokens": 654284421.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5792955160140991, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8703932762145996, + "num_tokens": 654327969.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6587398052215576, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8874695301055908, + "num_tokens": 654367141.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6686400175094604, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8724209070205688, + "num_tokens": 654405412.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7227704524993896, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8665890097618103, + "num_tokens": 654443477.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7967195510864258, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8509429097175598, + "num_tokens": 654481399.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.706605076789856, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.877348780632019, + "num_tokens": 654513104.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.778768539428711, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8696275949478149, + "num_tokens": 654547295.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5788440704345703, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8899370431900024, + "num_tokens": 654582783.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.603749394416809, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8731114268302917, + "num_tokens": 654620555.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.578470230102539, + "learning_rate": 1e-06, + "loss": 0.3257, + "mean_token_accuracy": 0.8928613662719727, + "num_tokens": 654657291.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5917000770568848, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.883310079574585, + "num_tokens": 654695173.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5819727182388306, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8703089356422424, + "num_tokens": 654736372.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7799615859985352, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8701463937759399, + "num_tokens": 654772012.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7391982078552246, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8627986907958984, + "num_tokens": 654812130.0, + "step": 17164 + }, + { + "epoch": 2.183564432006106, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.624027967453003, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8807368278503418, + "num_tokens": 654849282.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8290069103240967, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8665561676025391, + "num_tokens": 654884788.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.619301438331604, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8712365031242371, + "num_tokens": 654927278.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.717565894126892, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8708912134170532, + "num_tokens": 654961546.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.734344720840454, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8804716467857361, + "num_tokens": 654997088.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.811094880104065, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8788642883300781, + "num_tokens": 655026759.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7612115144729614, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8630496263504028, + "num_tokens": 655063615.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.9472986459732056, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8700225353240967, + "num_tokens": 655097324.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.670767903327942, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8725456595420837, + "num_tokens": 655135455.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.493984580039978, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8792654275894165, + "num_tokens": 655178170.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.600834608078003, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8894904851913452, + "num_tokens": 655212920.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7355077266693115, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8648950457572937, + "num_tokens": 655252652.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.560865879058838, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8721464276313782, + "num_tokens": 655292699.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8038771152496338, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8742746710777283, + "num_tokens": 655330599.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7434390783309937, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8708615303039551, + "num_tokens": 655367770.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.660654902458191, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8725484609603882, + "num_tokens": 655410152.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7945923805236816, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8725222945213318, + "num_tokens": 655445825.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6977967023849487, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8671048283576965, + "num_tokens": 655488081.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6333210468292236, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.883065938949585, + "num_tokens": 655527678.0, + "step": 17183 + }, + { + "epoch": 2.1859814272993257, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.708549976348877, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8694189786911011, + "num_tokens": 655572656.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.82976496219635, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.878510594367981, + "num_tokens": 655608268.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7333120107650757, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8905943632125854, + "num_tokens": 655644525.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5916324853897095, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8686296939849854, + "num_tokens": 655684871.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7141296863555908, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8743189573287964, + "num_tokens": 655720342.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.4712294340133667, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.884722888469696, + "num_tokens": 655762966.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5995241403579712, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8659850358963013, + "num_tokens": 655805543.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5790936946868896, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8894662261009216, + "num_tokens": 655844694.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6441963911056519, + "learning_rate": 1e-06, + "loss": 0.3172, + "mean_token_accuracy": 0.8963375687599182, + "num_tokens": 655878490.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5842427015304565, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8860951066017151, + "num_tokens": 655917069.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5976033210754395, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8815346956253052, + "num_tokens": 655958066.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.75343656539917, + "learning_rate": 1e-06, + "loss": 0.3413, + "mean_token_accuracy": 0.8889398574829102, + "num_tokens": 655991916.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.589250087738037, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8779059648513794, + "num_tokens": 656030208.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6675282716751099, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8825198411941528, + "num_tokens": 656070443.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7373220920562744, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8710694313049316, + "num_tokens": 656107765.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7162634134292603, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8631150722503662, + "num_tokens": 656147830.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.48183012008667, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8808334469795227, + "num_tokens": 656189028.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6066714525222778, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8769887685775757, + "num_tokens": 656227934.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8169552087783813, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.870631217956543, + "num_tokens": 656262984.0, + "step": 17202 + }, + { + "epoch": 2.1883984225925452, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 3.976485252380371, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8612355589866638, + "num_tokens": 656296944.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5526368618011475, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8710384368896484, + "num_tokens": 656340115.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7696171998977661, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8751981258392334, + "num_tokens": 656375620.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7324131727218628, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8733352422714233, + "num_tokens": 656409290.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6107622385025024, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8765278458595276, + "num_tokens": 656444515.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.619719386100769, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8762982487678528, + "num_tokens": 656487638.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6648355722427368, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8774493336677551, + "num_tokens": 656523510.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5621989965438843, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8819774389266968, + "num_tokens": 656564143.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7160141468048096, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8774518966674805, + "num_tokens": 656598285.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6256204843521118, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8754168748855591, + "num_tokens": 656641531.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6810400485992432, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8847119808197021, + "num_tokens": 656681782.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.748826026916504, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8815320730209351, + "num_tokens": 656717935.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6315454244613647, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.869372546672821, + "num_tokens": 656757538.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7060638666152954, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8614193201065063, + "num_tokens": 656795427.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6175509691238403, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8707329034805298, + "num_tokens": 656838754.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.848986029624939, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.859633207321167, + "num_tokens": 656875014.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6438591480255127, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8759443759918213, + "num_tokens": 656914064.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 2.0444722175598145, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8656141757965088, + "num_tokens": 656947640.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6715936660766602, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8640020489692688, + "num_tokens": 656985203.0, + "step": 17221 + }, + { + "epoch": 2.1908154178857653, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.657657265663147, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8728196620941162, + "num_tokens": 657023421.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.4792524576187134, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8784708976745605, + "num_tokens": 657067094.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5039968490600586, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8844575881958008, + "num_tokens": 657110404.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7104111909866333, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8610037565231323, + "num_tokens": 657147949.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.7127538919448853, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8778579831123352, + "num_tokens": 657180063.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.5924891233444214, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8842149972915649, + "num_tokens": 657218577.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.6528109312057495, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8802340030670166, + "num_tokens": 657258860.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 2.6941299438476562e-05, + "grad_norm": 1.8071640729904175, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8618574738502502, + "num_tokens": 657294902.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6583036184310913, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.872650146484375, + "num_tokens": 657331931.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7020293474197388, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8690977096557617, + "num_tokens": 657367692.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7391436100006104, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8543747663497925, + "num_tokens": 657409785.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6934819221496582, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8650068640708923, + "num_tokens": 657444435.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.616115689277649, + "learning_rate": 1e-06, + "loss": 0.3276, + "mean_token_accuracy": 0.8927633762359619, + "num_tokens": 657480054.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8028672933578491, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8755065202713013, + "num_tokens": 657512814.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.605983018875122, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8808718919754028, + "num_tokens": 657550734.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.58716881275177, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8831303119659424, + "num_tokens": 657587855.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.4975128173828125, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8837356567382812, + "num_tokens": 657631080.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.734119176864624, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8661927580833435, + "num_tokens": 657670092.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5830509662628174, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.879170835018158, + "num_tokens": 657708224.0, + "step": 17240 + }, + { + "epoch": 2.193232413178985, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5577380657196045, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.889400839805603, + "num_tokens": 657748626.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.75722336769104, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8671808838844299, + "num_tokens": 657782603.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.590121865272522, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8942274451255798, + "num_tokens": 657815950.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7718981504440308, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8585048913955688, + "num_tokens": 657853353.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.66849684715271, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8797810673713684, + "num_tokens": 657891494.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.8925080299377441, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8637865781784058, + "num_tokens": 657924851.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5708906650543213, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8627792596817017, + "num_tokens": 657968896.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6315609216690063, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8867877721786499, + "num_tokens": 658006297.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6907527446746826, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8796204328536987, + "num_tokens": 658041978.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7000858783721924, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8717561960220337, + "num_tokens": 658079985.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6277505159378052, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8727493286132812, + "num_tokens": 658119410.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7485957145690918, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8497648239135742, + "num_tokens": 658159692.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.9041095972061157, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8710314035415649, + "num_tokens": 658193742.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.64602530002594, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8884813785552979, + "num_tokens": 658231407.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5975444316864014, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8654797077178955, + "num_tokens": 658271364.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.731851577758789, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8864779472351074, + "num_tokens": 658304479.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.8304070234298706, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.880305290222168, + "num_tokens": 658338519.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6414682865142822, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8686990737915039, + "num_tokens": 658375325.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6070170402526855, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8784160017967224, + "num_tokens": 658416273.0, + "step": 17259 + }, + { + "epoch": 2.1956494084722045, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5902007818222046, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8736539483070374, + "num_tokens": 658458320.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5814520120620728, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8841339349746704, + "num_tokens": 658496232.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7349035739898682, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8661546111106873, + "num_tokens": 658532209.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.7265583276748657, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8674980401992798, + "num_tokens": 658568782.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.6781543493270874, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8714683055877686, + "num_tokens": 658613385.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.8431694507598877, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.859879732131958, + "num_tokens": 658645647.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.590041995048523, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8799426555633545, + "num_tokens": 658687328.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.588813066482544, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8722081184387207, + "num_tokens": 658727357.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.9301164150238037, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8740404844284058, + "num_tokens": 658762669.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.5552303791046143, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8764863014221191, + "num_tokens": 658803436.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 2.7060508728027344e-05, + "grad_norm": 1.8580317497253418, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8678818345069885, + "num_tokens": 658834720.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6326273679733276, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8761008977890015, + "num_tokens": 658877242.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5914945602416992, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8755488395690918, + "num_tokens": 658919014.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7154959440231323, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8686580657958984, + "num_tokens": 658953938.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6144942045211792, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8767153024673462, + "num_tokens": 658991543.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6669098138809204, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8798215985298157, + "num_tokens": 659032390.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7473981380462646, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8796871900558472, + "num_tokens": 659067375.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6832919120788574, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8736528158187866, + "num_tokens": 659102419.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6177659034729004, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8755024671554565, + "num_tokens": 659139500.0, + "step": 17278 + }, + { + "epoch": 2.198066403765424, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.9404734373092651, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8729184865951538, + "num_tokens": 659170548.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7201026678085327, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8643638491630554, + "num_tokens": 659208923.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7772012948989868, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8661181926727295, + "num_tokens": 659243412.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7541413307189941, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8790376782417297, + "num_tokens": 659280171.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5951231718063354, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8714022040367126, + "num_tokens": 659323022.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7085438966751099, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8603776693344116, + "num_tokens": 659359630.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.542917251586914, + "learning_rate": 1e-06, + "loss": 0.3187, + "mean_token_accuracy": 0.8964238166809082, + "num_tokens": 659399302.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.747889757156372, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8827997446060181, + "num_tokens": 659430448.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5164815187454224, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8810193538665771, + "num_tokens": 659474504.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.665555477142334, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8828616142272949, + "num_tokens": 659510225.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6425267457962036, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8603345155715942, + "num_tokens": 659551872.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7543349266052246, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8762613534927368, + "num_tokens": 659585540.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6025127172470093, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8662662506103516, + "num_tokens": 659627344.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6941823959350586, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8618590831756592, + "num_tokens": 659664613.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5344058275222778, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8898500800132751, + "num_tokens": 659705711.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5982253551483154, + "learning_rate": 1e-06, + "loss": 0.34, + "mean_token_accuracy": 0.8875901699066162, + "num_tokens": 659741854.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7132073640823364, + "learning_rate": 1e-06, + "loss": 0.3079, + "mean_token_accuracy": 0.8981042504310608, + "num_tokens": 659774922.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6650168895721436, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8852227926254272, + "num_tokens": 659814051.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8917344808578491, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8696326017379761, + "num_tokens": 659851520.0, + "step": 17297 + }, + { + "epoch": 2.200483399058644, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7675342559814453, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.880156397819519, + "num_tokens": 659886732.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6089459657669067, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.864397406578064, + "num_tokens": 659926774.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5548970699310303, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8827683329582214, + "num_tokens": 659969997.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.72406005859375, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8640410304069519, + "num_tokens": 660008138.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5018075704574585, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8863617181777954, + "num_tokens": 660052034.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.723395824432373, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8735575675964355, + "num_tokens": 660084052.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6316468715667725, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8664214611053467, + "num_tokens": 660124689.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5665949583053589, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8719640374183655, + "num_tokens": 660168395.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5540870428085327, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8683526515960693, + "num_tokens": 660211552.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.572577714920044, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8777106404304504, + "num_tokens": 660252034.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.599826693534851, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8804474472999573, + "num_tokens": 660291431.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6010628938674927, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8813555240631104, + "num_tokens": 660333243.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5812281370162964, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8889353275299072, + "num_tokens": 660369298.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7415093183517456, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8904479742050171, + "num_tokens": 660405642.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.785425066947937, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8829760551452637, + "num_tokens": 660445224.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5153552293777466, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8774529695510864, + "num_tokens": 660487258.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.59465491771698, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8814730048179626, + "num_tokens": 660526699.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.704417109489441, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8685556650161743, + "num_tokens": 660563089.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.9591093063354492, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8769543766975403, + "num_tokens": 660596012.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6684507131576538, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8821755051612854, + "num_tokens": 660635985.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5908275842666626, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8765743374824524, + "num_tokens": 660673802.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.607435941696167, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8766219615936279, + "num_tokens": 660711790.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6034046411514282, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.880217432975769, + "num_tokens": 660750614.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.558542013168335, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8890302181243896, + "num_tokens": 660788615.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.748857855796814, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8860286474227905, + "num_tokens": 660823441.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6614680290222168, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8837904930114746, + "num_tokens": 660863157.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6292551755905151, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8734047412872314, + "num_tokens": 660903338.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6466456651687622, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8774352073669434, + "num_tokens": 660940244.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6553218364715576, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8776780962944031, + "num_tokens": 660980654.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8045567274093628, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8720662593841553, + "num_tokens": 661017580.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6324917078018188, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8783884048461914, + "num_tokens": 661055873.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6709131002426147, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.873510479927063, + "num_tokens": 661095388.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6284868717193604, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8825857043266296, + "num_tokens": 661136442.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6783133745193481, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8747850656509399, + "num_tokens": 661174484.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8250154256820679, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8740788102149963, + "num_tokens": 661207425.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8227471113204956, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8485071063041687, + "num_tokens": 661243022.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7404029369354248, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8748082518577576, + "num_tokens": 661277770.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7241445779800415, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8801828622817993, + "num_tokens": 661312512.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6316031217575073, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8748833537101746, + "num_tokens": 661350432.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.791961431503296, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8507431745529175, + "num_tokens": 661388529.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.693602442741394, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8863654732704163, + "num_tokens": 661422952.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6657081842422485, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8855350017547607, + "num_tokens": 661457424.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6022744178771973, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8804998397827148, + "num_tokens": 661498566.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6525757312774658, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8829888105392456, + "num_tokens": 661536564.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8415569067001343, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8655753135681152, + "num_tokens": 661572147.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6773593425750732, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8806358575820923, + "num_tokens": 661610747.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6879942417144775, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8823846578598022, + "num_tokens": 661647262.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.831955075263977, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8670018911361694, + "num_tokens": 661679119.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.724966049194336, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8877844214439392, + "num_tokens": 661712615.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7385278940200806, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8557335138320923, + "num_tokens": 661749467.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7831517457962036, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.873013973236084, + "num_tokens": 661782762.0, + "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5305769443511963, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8778328895568848, + "num_tokens": 661823270.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.531432032585144, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8864072561264038, + "num_tokens": 661862877.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5615993738174438, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8763187527656555, + "num_tokens": 661903964.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7445768117904663, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8626893758773804, + "num_tokens": 661938749.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8280214071273804, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8745046257972717, + "num_tokens": 661968897.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6144654750823975, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8732104301452637, + "num_tokens": 662008514.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5464422702789307, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8765052556991577, + "num_tokens": 662047977.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.74009108543396, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8735173344612122, + "num_tokens": 662085883.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.4702826738357544, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.890709638595581, + "num_tokens": 662126943.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.47976815700531, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8847787976264954, + "num_tokens": 662171493.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7709401845932007, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8689591884613037, + "num_tokens": 662208454.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.587895393371582, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8870063424110413, + "num_tokens": 662245430.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7177729606628418, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8688713908195496, + "num_tokens": 662285557.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6776129007339478, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8810747861862183, + "num_tokens": 662321702.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8440725803375244, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8715201616287231, + "num_tokens": 662357406.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7492650747299194, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8826141357421875, + "num_tokens": 662391362.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7744027376174927, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8584343194961548, + "num_tokens": 662430995.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5922032594680786, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8795128464698792, + "num_tokens": 662472508.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5896508693695068, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8693827390670776, + "num_tokens": 662515498.0, + "step": 17367 + }, + { + "epoch": 2.2093881185599797, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.684126853942871, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8756598830223083, + "num_tokens": 662555359.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.52785062789917, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8902423977851868, + "num_tokens": 662594203.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7251067161560059, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8736007809638977, + "num_tokens": 662631823.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8412799835205078, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8790373802185059, + "num_tokens": 662669492.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7381603717803955, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8680083751678467, + "num_tokens": 662706532.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7714648246765137, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8634052276611328, + "num_tokens": 662742407.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6965471506118774, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8775983452796936, + "num_tokens": 662778206.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.782172441482544, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8705376982688904, + "num_tokens": 662813390.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6514036655426025, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8849002718925476, + "num_tokens": 662847364.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.570917010307312, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8784974217414856, + "num_tokens": 662888814.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7351042032241821, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8688870072364807, + "num_tokens": 662922335.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5651037693023682, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8809136152267456, + "num_tokens": 662966149.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.696336269378662, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8818464279174805, + "num_tokens": 663000375.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7547494173049927, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8766871094703674, + "num_tokens": 663034146.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6913331747055054, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8552635312080383, + "num_tokens": 663075281.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6021077632904053, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8876050114631653, + "num_tokens": 663111139.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5372073650360107, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8829333782196045, + "num_tokens": 663149872.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8189046382904053, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8785673379898071, + "num_tokens": 663186221.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6439493894577026, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.881860613822937, + "num_tokens": 663221407.0, + "step": 17386 + }, + { + "epoch": 2.2118051138531993, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7176823616027832, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8735207319259644, + "num_tokens": 663260355.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5822250843048096, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8827319145202637, + "num_tokens": 663299766.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6198012828826904, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8596891760826111, + "num_tokens": 663342605.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5661133527755737, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8765586018562317, + "num_tokens": 663381714.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5347204208374023, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8921340703964233, + "num_tokens": 663420408.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.664214015007019, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8771623373031616, + "num_tokens": 663457023.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5305426120758057, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8727359771728516, + "num_tokens": 663498397.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.707999348640442, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8774985074996948, + "num_tokens": 663530445.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6453368663787842, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8841423988342285, + "num_tokens": 663566195.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.696113109588623, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.881806492805481, + "num_tokens": 663601930.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6328715085983276, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8612106442451477, + "num_tokens": 663642554.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5985990762710571, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8784523010253906, + "num_tokens": 663680516.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6952142715454102, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8846684694290161, + "num_tokens": 663713916.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5929605960845947, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8813751935958862, + "num_tokens": 663754273.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6025644540786743, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8680719137191772, + "num_tokens": 663799139.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7382683753967285, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8842442035675049, + "num_tokens": 663833244.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6983212232589722, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.866517186164856, + "num_tokens": 663871322.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5761162042617798, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8774465918540955, + "num_tokens": 663912595.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7735679149627686, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8784545063972473, + "num_tokens": 663946436.0, + "step": 17405 + }, + { + "epoch": 2.214222109146419, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7455024719238281, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8754764795303345, + "num_tokens": 663983058.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7974894046783447, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8710470199584961, + "num_tokens": 664022940.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7353923320770264, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8460486531257629, + "num_tokens": 664059104.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5405735969543457, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8809950351715088, + "num_tokens": 664099051.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8384073972702026, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8508045673370361, + "num_tokens": 664136192.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.600022315979004, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.892910897731781, + "num_tokens": 664173184.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7842758893966675, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8570876717567444, + "num_tokens": 664207129.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7163740396499634, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.882346510887146, + "num_tokens": 664241478.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7497590780258179, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8666383028030396, + "num_tokens": 664279790.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6330333948135376, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8731716871261597, + "num_tokens": 664318606.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6029404401779175, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8790513873100281, + "num_tokens": 664355518.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5944194793701172, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.877775251865387, + "num_tokens": 664396745.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6473366022109985, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8728954792022705, + "num_tokens": 664440685.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7193872928619385, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8738108277320862, + "num_tokens": 664477015.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.4915649890899658, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8833877444267273, + "num_tokens": 664522128.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6277488470077515, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.88075852394104, + "num_tokens": 664562535.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8331317901611328, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8609023094177246, + "num_tokens": 664593945.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8305959701538086, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8797142505645752, + "num_tokens": 664625191.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7077926397323608, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8766292929649353, + "num_tokens": 664662637.0, + "step": 17424 + }, + { + "epoch": 2.2166391044396385, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6804747581481934, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8775629997253418, + "num_tokens": 664697212.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8058686256408691, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.854096531867981, + "num_tokens": 664735021.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6719902753829956, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8743181228637695, + "num_tokens": 664769212.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.628808856010437, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8799420595169067, + "num_tokens": 664809211.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7937811613082886, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8712468147277832, + "num_tokens": 664841788.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.762848138809204, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8789986371994019, + "num_tokens": 664876334.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6683729887008667, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8788799047470093, + "num_tokens": 664913720.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7300069332122803, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8641723394393921, + "num_tokens": 664951595.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.589111566543579, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8844352960586548, + "num_tokens": 664991823.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6031278371810913, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8770953416824341, + "num_tokens": 665032287.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4526869058609009, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8759126663208008, + "num_tokens": 665078174.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6311147212982178, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8801918029785156, + "num_tokens": 665115027.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6915255784988403, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8853549957275391, + "num_tokens": 665148218.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6789301633834839, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.864592969417572, + "num_tokens": 665189408.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6927080154418945, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8810723423957825, + "num_tokens": 665228731.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6751803159713745, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8725346326828003, + "num_tokens": 665266874.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.634879231452942, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8678808808326721, + "num_tokens": 665305188.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4678394794464111, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8719879388809204, + "num_tokens": 665348755.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7687798738479614, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8846341371536255, + "num_tokens": 665380061.0, + "step": 17443 + }, + { + "epoch": 2.2190560997328586, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.631106972694397, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.872955322265625, + "num_tokens": 665418346.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6367219686508179, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8669594526290894, + "num_tokens": 665460350.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5709081888198853, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8861993551254272, + "num_tokens": 665498270.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7500050067901611, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8610548973083496, + "num_tokens": 665535934.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5621849298477173, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.888195276260376, + "num_tokens": 665578383.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6140626668930054, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8655829429626465, + "num_tokens": 665622693.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8293344974517822, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8793550729751587, + "num_tokens": 665654663.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6799598932266235, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8705630898475647, + "num_tokens": 665691950.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6781376600265503, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8823250532150269, + "num_tokens": 665727062.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.577340841293335, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8852813839912415, + "num_tokens": 665766531.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6850285530090332, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8755097389221191, + "num_tokens": 665801057.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5354738235473633, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8857628703117371, + "num_tokens": 665837947.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6223247051239014, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.874285101890564, + "num_tokens": 665874172.0, + "step": 17456 + }, + { + "epoch": 2.220709833354535, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5512274503707886, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8758073449134827, + "num_tokens": 665917258.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6702836751937866, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8625072240829468, + "num_tokens": 665956940.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.726783275604248, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8737055659294128, + "num_tokens": 665993283.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5079411268234253, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8836204409599304, + "num_tokens": 666035841.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5731191635131836, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8816883563995361, + "num_tokens": 666074919.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6610186100006104, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8682253360748291, + "num_tokens": 666112074.0, + "step": 17462 + }, + { + "epoch": 2.221473095026078, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6556092500686646, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8853949308395386, + "num_tokens": 666151439.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5649127960205078, + "learning_rate": 1e-06, + "loss": 0.3217, + "mean_token_accuracy": 0.8972121477127075, + "num_tokens": 666191492.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.578721284866333, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8775119781494141, + "num_tokens": 666234333.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.69454824924469, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8642809391021729, + "num_tokens": 666274464.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.749235987663269, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8885323405265808, + "num_tokens": 666310669.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5650925636291504, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8796595335006714, + "num_tokens": 666353330.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6128876209259033, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8813812732696533, + "num_tokens": 666393967.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6436177492141724, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8535524606704712, + "num_tokens": 666438339.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.651061773300171, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.876828134059906, + "num_tokens": 666477635.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6355539560317993, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8649812936782837, + "num_tokens": 666519658.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.650445580482483, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8774943351745605, + "num_tokens": 666558886.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.587167739868164, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8749529719352722, + "num_tokens": 666596822.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6861072778701782, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8781836032867432, + "num_tokens": 666636203.0, + "step": 17475 + }, + { + "epoch": 2.2231268286477546, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6204900741577148, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8771940469741821, + "num_tokens": 666674207.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5473600625991821, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8588471412658691, + "num_tokens": 666715897.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8439133167266846, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8731285929679871, + "num_tokens": 666754695.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.592929720878601, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8815279006958008, + "num_tokens": 666793532.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5533298254013062, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8835850954055786, + "num_tokens": 666835588.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6255700588226318, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8716665506362915, + "num_tokens": 666874429.0, + "step": 17481 + }, + { + "epoch": 2.2238900903192977, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8080857992172241, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8758624792098999, + "num_tokens": 666906643.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7617641687393188, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8703575134277344, + "num_tokens": 666942504.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.637751817703247, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8843598365783691, + "num_tokens": 666977554.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5479321479797363, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8766993284225464, + "num_tokens": 667020267.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.776417851448059, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8776575326919556, + "num_tokens": 667053035.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6044362783432007, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8838231563568115, + "num_tokens": 667090758.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5537651777267456, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8759119510650635, + "num_tokens": 667133092.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6290160417556763, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8813309073448181, + "num_tokens": 667170671.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8055697679519653, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.87165367603302, + "num_tokens": 667205944.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7970942258834839, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8603215217590332, + "num_tokens": 667242452.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.529535174369812, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8794551491737366, + "num_tokens": 667284036.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7940369844436646, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8475028872489929, + "num_tokens": 667321462.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7255773544311523, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8553440570831299, + "num_tokens": 667358291.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6742404699325562, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8739693760871887, + "num_tokens": 667396934.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.701094627380371, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8671005964279175, + "num_tokens": 667435011.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5218915939331055, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8729755878448486, + "num_tokens": 667479751.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6461039781570435, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8771666288375854, + "num_tokens": 667517508.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8266521692276, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8752516508102417, + "num_tokens": 667549018.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6360725164413452, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8782308101654053, + "num_tokens": 667588000.0, + "step": 17500 + }, + { + "epoch": 2.2263070856125173, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.646362066268921, + "learning_rate": 1e-06, + "loss": 0.3452, + "mean_token_accuracy": 0.8869805335998535, + "num_tokens": 667623951.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6218360662460327, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8764902353286743, + "num_tokens": 667664586.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5721502304077148, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8819802403450012, + "num_tokens": 667705602.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6857001781463623, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.88853919506073, + "num_tokens": 667739596.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.690200924873352, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8812173008918762, + "num_tokens": 667774159.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5784616470336914, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8715800046920776, + "num_tokens": 667815842.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.705483317375183, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8805446624755859, + "num_tokens": 667848065.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.764539361000061, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8639408349990845, + "num_tokens": 667888594.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6665891408920288, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8820315599441528, + "num_tokens": 667925327.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9103606939315796, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8705071210861206, + "num_tokens": 667955685.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6794164180755615, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8567997217178345, + "num_tokens": 667994765.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5827250480651855, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8763047456741333, + "num_tokens": 668034843.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7635611295700073, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8677099943161011, + "num_tokens": 668074773.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7643332481384277, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8734899759292603, + "num_tokens": 668111448.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5498701333999634, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.893255889415741, + "num_tokens": 668150452.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7144256830215454, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8810305595397949, + "num_tokens": 668187353.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5702033042907715, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8654638528823853, + "num_tokens": 668232937.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5206377506256104, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8822168111801147, + "num_tokens": 668274852.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.569257140159607, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8783817291259766, + "num_tokens": 668315226.0, + "step": 17519 + }, + { + "epoch": 2.2287240809057374, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7524638175964355, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871049702167511, + "num_tokens": 668350161.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7055777311325073, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8719281554222107, + "num_tokens": 668387212.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.672881007194519, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8626222610473633, + "num_tokens": 668425670.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6872990131378174, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8854086399078369, + "num_tokens": 668459449.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6124186515808105, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8642813563346863, + "num_tokens": 668500049.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7009267807006836, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8686613440513611, + "num_tokens": 668538300.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5875362157821655, + "learning_rate": 1e-06, + "loss": 0.324, + "mean_token_accuracy": 0.8937323093414307, + "num_tokens": 668574240.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5569196939468384, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.875275194644928, + "num_tokens": 668616008.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6753122806549072, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8746698498725891, + "num_tokens": 668648969.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6519453525543213, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8695296049118042, + "num_tokens": 668689753.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.547408938407898, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.879420280456543, + "num_tokens": 668729618.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7764638662338257, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8765764832496643, + "num_tokens": 668762995.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.690121054649353, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.856055498123169, + "num_tokens": 668802741.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6618423461914062, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8876132369041443, + "num_tokens": 668838794.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6032235622406006, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8765260577201843, + "num_tokens": 668879676.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.579474925994873, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8771522641181946, + "num_tokens": 668921103.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6906428337097168, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8867099285125732, + "num_tokens": 668953511.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5932543277740479, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8841650485992432, + "num_tokens": 668992131.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.676148533821106, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8825556635856628, + "num_tokens": 669027064.0, + "step": 17538 + }, + { + "epoch": 2.231141076198957, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6079716682434082, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8724942207336426, + "num_tokens": 669066664.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6757663488388062, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8684494495391846, + "num_tokens": 669106368.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5653167963027954, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8823379278182983, + "num_tokens": 669144754.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6564700603485107, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8743717670440674, + "num_tokens": 669183191.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5366297960281372, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8863856792449951, + "num_tokens": 669219822.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7595124244689941, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8819480538368225, + "num_tokens": 669252320.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6912786960601807, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.869544267654419, + "num_tokens": 669288382.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6561708450317383, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8748036026954651, + "num_tokens": 669325839.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5948984622955322, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8712366819381714, + "num_tokens": 669364177.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6211930513381958, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8766088485717773, + "num_tokens": 669401558.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5187623500823975, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8824461698532104, + "num_tokens": 669441893.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.564087986946106, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8699829578399658, + "num_tokens": 669483492.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5545731782913208, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8798896074295044, + "num_tokens": 669528723.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.69356107711792, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8716458082199097, + "num_tokens": 669564054.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5967633724212646, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8815066814422607, + "num_tokens": 669598841.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.681666374206543, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8774784207344055, + "num_tokens": 669636205.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6682522296905518, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.873788595199585, + "num_tokens": 669672598.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.814866304397583, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8625889420509338, + "num_tokens": 669706158.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.634569764137268, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.874670147895813, + "num_tokens": 669742732.0, + "step": 17557 + }, + { + "epoch": 2.2335580714921766, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.8071898221969604, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8571345210075378, + "num_tokens": 669783608.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.562663197517395, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8814989328384399, + "num_tokens": 669827421.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.72371506690979, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8707525730133057, + "num_tokens": 669864270.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6664278507232666, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8608463406562805, + "num_tokens": 669904993.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6080567836761475, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8644342422485352, + "num_tokens": 669947173.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6738550662994385, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8640488982200623, + "num_tokens": 669991365.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6399717330932617, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8735222220420837, + "num_tokens": 670029043.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6749303340911865, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8748643398284912, + "num_tokens": 670063776.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.592099905014038, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.862760066986084, + "num_tokens": 670106271.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7466940879821777, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8644297122955322, + "num_tokens": 670147230.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.54966139793396, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.881913959980011, + "num_tokens": 670188131.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.776131272315979, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8721084594726562, + "num_tokens": 670221324.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.4887357950210571, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8875473737716675, + "num_tokens": 670263533.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.660521149635315, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8811836838722229, + "num_tokens": 670298685.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5808773040771484, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8804117441177368, + "num_tokens": 670340584.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7208232879638672, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8715068101882935, + "num_tokens": 670381353.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5435885190963745, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8719632625579834, + "num_tokens": 670425540.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5928689241409302, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8652077317237854, + "num_tokens": 670467851.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6527785062789917, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8785814046859741, + "num_tokens": 670504990.0, + "step": 17576 + }, + { + "epoch": 2.235975066785396, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6278810501098633, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8677771091461182, + "num_tokens": 670546734.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.622973084449768, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8823800086975098, + "num_tokens": 670586515.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6822932958602905, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8721063137054443, + "num_tokens": 670623997.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.616308569908142, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8776196241378784, + "num_tokens": 670664096.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6598106622695923, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8733935952186584, + "num_tokens": 670699698.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7030243873596191, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8650888204574585, + "num_tokens": 670737305.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7629896402359009, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.866239070892334, + "num_tokens": 670772563.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6907514333724976, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8664987087249756, + "num_tokens": 670810234.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5582681894302368, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.879023551940918, + "num_tokens": 670850045.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6338348388671875, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8680384159088135, + "num_tokens": 670888719.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4990257024765015, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8789107799530029, + "num_tokens": 670929401.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.616689682006836, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.874501645565033, + "num_tokens": 670967918.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7056148052215576, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8737567663192749, + "num_tokens": 671001729.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.593788981437683, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8830262422561646, + "num_tokens": 671040367.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.645203709602356, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8639472126960754, + "num_tokens": 671080865.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6969530582427979, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8850611448287964, + "num_tokens": 671114286.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.5516140460968018, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8696452975273132, + "num_tokens": 671158823.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6591973304748535, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8865387439727783, + "num_tokens": 671193731.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 16.702625274658203, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8656103610992432, + "num_tokens": 671232788.0, + "step": 17595 + }, + { + "epoch": 2.238392062078616, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6458635330200195, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.880072832107544, + "num_tokens": 671272135.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.551166296005249, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8752573728561401, + "num_tokens": 671317274.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6666789054870605, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.872588574886322, + "num_tokens": 671354425.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6745631694793701, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8721972703933716, + "num_tokens": 671395030.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.7001540660858154, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8737157583236694, + "num_tokens": 671431069.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6361066102981567, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8759321570396423, + "num_tokens": 671468458.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6673040390014648, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8858976364135742, + "num_tokens": 671505952.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5557737350463867, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8846831321716309, + "num_tokens": 671547721.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5207695960998535, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8748661875724792, + "num_tokens": 671590413.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6205406188964844, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8726625442504883, + "num_tokens": 671631546.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 2.7179718017578125e-05, + "grad_norm": 1.6116036176681519, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8684767484664917, + "num_tokens": 671668476.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.708226203918457, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8850671052932739, + "num_tokens": 671703516.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6296662092208862, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8788503408432007, + "num_tokens": 671742549.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5638513565063477, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8759267926216125, + "num_tokens": 671783986.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6893519163131714, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8744651079177856, + "num_tokens": 671825250.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7417100667953491, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8546276092529297, + "num_tokens": 671862655.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7747834920883179, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8564107418060303, + "num_tokens": 671899500.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7077611684799194, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8813965320587158, + "num_tokens": 671934285.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6898077726364136, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8895021677017212, + "num_tokens": 671968704.0, + "step": 17614 + }, + { + "epoch": 2.2408090573718358, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5369709730148315, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8759669065475464, + "num_tokens": 672010755.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7191170454025269, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8607789278030396, + "num_tokens": 672049164.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.675416111946106, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8735789060592651, + "num_tokens": 672085215.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5584923028945923, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.876976490020752, + "num_tokens": 672125167.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6480138301849365, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8777786493301392, + "num_tokens": 672164377.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.609241247177124, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8812810182571411, + "num_tokens": 672204417.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6211811304092407, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8689602613449097, + "num_tokens": 672243156.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.766379952430725, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8732280135154724, + "num_tokens": 672277690.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5777108669281006, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8867084980010986, + "num_tokens": 672314735.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5834338665008545, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8882231712341309, + "num_tokens": 672350571.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6828718185424805, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8649951815605164, + "num_tokens": 672390121.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6704250574111938, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8691114187240601, + "num_tokens": 672424221.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6600788831710815, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8787869215011597, + "num_tokens": 672459381.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5518730878829956, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8754674196243286, + "num_tokens": 672502347.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5942248106002808, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8707670569419861, + "num_tokens": 672543396.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7701255083084106, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8650841116905212, + "num_tokens": 672582677.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7266772985458374, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8706899881362915, + "num_tokens": 672620564.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4517558813095093, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8816356658935547, + "num_tokens": 672665187.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6357059478759766, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8839789628982544, + "num_tokens": 672700432.0, + "step": 17633 + }, + { + "epoch": 2.2432260526650554, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6834838390350342, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8775363564491272, + "num_tokens": 672736327.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7793453931808472, + "learning_rate": 1e-06, + "loss": 0.3311, + "mean_token_accuracy": 0.890786349773407, + "num_tokens": 672768864.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.838849663734436, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8555054664611816, + "num_tokens": 672802743.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5980501174926758, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.867982029914856, + "num_tokens": 672847570.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9526180028915405, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8807754516601562, + "num_tokens": 672878709.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6016842126846313, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8847193717956543, + "num_tokens": 672920082.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6430023908615112, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8732044696807861, + "num_tokens": 672959516.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5389513969421387, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8761829137802124, + "num_tokens": 673003870.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5769846439361572, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8763445019721985, + "num_tokens": 673046789.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 3.3768022060394287, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8756990432739258, + "num_tokens": 673090317.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5541777610778809, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8895808458328247, + "num_tokens": 673127215.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7922343015670776, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8808372020721436, + "num_tokens": 673162524.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.713850736618042, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8642874956130981, + "num_tokens": 673205998.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7400538921356201, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8746587038040161, + "num_tokens": 673240618.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5373921394348145, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8793243169784546, + "num_tokens": 673283827.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.446056842803955, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8894392251968384, + "num_tokens": 673326252.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7195039987564087, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8639264106750488, + "num_tokens": 673365384.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8164567947387695, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8824214935302734, + "num_tokens": 673401247.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 2.1829938888549805, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8963676691055298, + "num_tokens": 673437938.0, + "step": 17652 + }, + { + "epoch": 2.245643047958275, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7612332105636597, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8719710111618042, + "num_tokens": 673471028.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.690775990486145, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.874121904373169, + "num_tokens": 673510319.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6349127292633057, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8824304342269897, + "num_tokens": 673548030.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5177119970321655, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8885668516159058, + "num_tokens": 673590541.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7153834104537964, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.872279167175293, + "num_tokens": 673624122.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6633909940719604, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8725976943969727, + "num_tokens": 673665784.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5791831016540527, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8687847256660461, + "num_tokens": 673705256.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.692683219909668, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.864215612411499, + "num_tokens": 673744086.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6893837451934814, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8772077560424805, + "num_tokens": 673785577.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5464798212051392, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8710525035858154, + "num_tokens": 673829138.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7930231094360352, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8683002591133118, + "num_tokens": 673860973.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.557393193244934, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8871753215789795, + "num_tokens": 673900139.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7145246267318726, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8605290651321411, + "num_tokens": 673944677.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6935893297195435, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8657331466674805, + "num_tokens": 673988844.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5443094968795776, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8792227506637573, + "num_tokens": 674032483.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8257496356964111, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8594859838485718, + "num_tokens": 674068741.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6017954349517822, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8830488324165344, + "num_tokens": 674112227.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 2.394871473312378, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8677877187728882, + "num_tokens": 674149918.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9245762825012207, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8659610748291016, + "num_tokens": 674182689.0, + "step": 17671 + }, + { + "epoch": 2.2480600432514946, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7131567001342773, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8606787919998169, + "num_tokens": 674222713.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6905382871627808, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8750087022781372, + "num_tokens": 674260105.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.779460072517395, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8705192804336548, + "num_tokens": 674298133.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6374629735946655, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8828816413879395, + "num_tokens": 674335246.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6411312818527222, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8796690702438354, + "num_tokens": 674375003.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7336617708206177, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8706799149513245, + "num_tokens": 674410244.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.701141119003296, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8633862733840942, + "num_tokens": 674450044.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5756585597991943, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8830666542053223, + "num_tokens": 674490507.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6506966352462769, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8877778053283691, + "num_tokens": 674525417.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7794820070266724, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8776353597640991, + "num_tokens": 674560446.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6747466325759888, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8800641298294067, + "num_tokens": 674597527.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7146844863891602, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8726521134376526, + "num_tokens": 674635444.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.637908697128296, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8786762952804565, + "num_tokens": 674672659.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6401716470718384, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8807565569877625, + "num_tokens": 674711214.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.723899245262146, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8607066869735718, + "num_tokens": 674746744.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8380411863327026, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8591514825820923, + "num_tokens": 674782183.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6339861154556274, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8820554614067078, + "num_tokens": 674822795.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8386878967285156, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8618270754814148, + "num_tokens": 674857441.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6763834953308105, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8680626153945923, + "num_tokens": 674896144.0, + "step": 17690 + }, + { + "epoch": 2.2504770385447146, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7084132432937622, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8775618076324463, + "num_tokens": 674931104.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6036386489868164, + "learning_rate": 1e-06, + "loss": 0.3553, + "mean_token_accuracy": 0.8837630152702332, + "num_tokens": 674966541.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8142575025558472, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8729215860366821, + "num_tokens": 675003270.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5340495109558105, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8769792318344116, + "num_tokens": 675044470.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5983046293258667, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8689469695091248, + "num_tokens": 675082750.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7649849653244019, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8635371327400208, + "num_tokens": 675119160.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6834831237792969, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8640952110290527, + "num_tokens": 675157850.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5508852005004883, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8795098662376404, + "num_tokens": 675198523.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6879528760910034, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.854819118976593, + "num_tokens": 675236309.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7733373641967773, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8610143065452576, + "num_tokens": 675273440.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7156593799591064, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8896808624267578, + "num_tokens": 675306129.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6000044345855713, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8828151226043701, + "num_tokens": 675345955.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6665613651275635, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8674638271331787, + "num_tokens": 675388204.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6770402193069458, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8685067892074585, + "num_tokens": 675424478.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5213109254837036, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.877149224281311, + "num_tokens": 675465282.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5819952487945557, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8754141330718994, + "num_tokens": 675503012.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5417630672454834, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8710097074508667, + "num_tokens": 675547407.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6827431917190552, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8768770694732666, + "num_tokens": 675584545.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.686875581741333, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8727717399597168, + "num_tokens": 675620111.0, + "step": 17709 + }, + { + "epoch": 2.252894033837934, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6949634552001953, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8699564337730408, + "num_tokens": 675657396.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7584620714187622, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8881173133850098, + "num_tokens": 675690360.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6083840131759644, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8732799291610718, + "num_tokens": 675728794.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5811772346496582, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8646835088729858, + "num_tokens": 675772514.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.665959358215332, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8762205839157104, + "num_tokens": 675808072.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6942722797393799, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8647082448005676, + "num_tokens": 675846074.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6023908853530884, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8798566460609436, + "num_tokens": 675888806.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6524276733398438, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8804410099983215, + "num_tokens": 675928077.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5524929761886597, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8804912567138672, + "num_tokens": 675964413.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7758437395095825, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8877366781234741, + "num_tokens": 675995741.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6604828834533691, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8727738857269287, + "num_tokens": 676033783.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6797292232513428, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8834774494171143, + "num_tokens": 676069481.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6244137287139893, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8694167137145996, + "num_tokens": 676112916.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5397388935089111, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8670052289962769, + "num_tokens": 676158714.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6659866571426392, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8822628855705261, + "num_tokens": 676193918.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.668253779411316, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8870781660079956, + "num_tokens": 676229110.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4827197790145874, + "learning_rate": 1e-06, + "loss": 0.3317, + "mean_token_accuracy": 0.893754780292511, + "num_tokens": 676274829.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7143865823745728, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8788928985595703, + "num_tokens": 676313233.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6585204601287842, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8699396252632141, + "num_tokens": 676351691.0, + "step": 17728 + }, + { + "epoch": 2.2553110291311538, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6881684064865112, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.869022786617279, + "num_tokens": 676390473.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7025026082992554, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8678978085517883, + "num_tokens": 676429311.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5878218412399292, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8766604661941528, + "num_tokens": 676470526.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6629937887191772, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8783039450645447, + "num_tokens": 676506988.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6647411584854126, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8740454912185669, + "num_tokens": 676541458.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7103887796401978, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8950356841087341, + "num_tokens": 676574770.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6813609600067139, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8778995275497437, + "num_tokens": 676610011.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6139787435531616, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.868001401424408, + "num_tokens": 676648961.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5711103677749634, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8801827430725098, + "num_tokens": 676689208.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7663863897323608, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8751348853111267, + "num_tokens": 676728127.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.728186011314392, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8868948221206665, + "num_tokens": 676761586.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7548311948776245, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8723387718200684, + "num_tokens": 676796027.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7128238677978516, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8557316064834595, + "num_tokens": 676837870.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5635842084884644, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8728394508361816, + "num_tokens": 676879315.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5941420793533325, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8824811577796936, + "num_tokens": 676915324.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5664433240890503, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8769710063934326, + "num_tokens": 676953084.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.562061071395874, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8751999735832214, + "num_tokens": 676997836.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.798101782798767, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8848888874053955, + "num_tokens": 677032951.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.727669358253479, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8810025453567505, + "num_tokens": 677069447.0, + "step": 17747 + }, + { + "epoch": 2.2577280244243734, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6338286399841309, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8736699819564819, + "num_tokens": 677108884.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6859122514724731, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8715180158615112, + "num_tokens": 677151670.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.613844633102417, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8719980716705322, + "num_tokens": 677192157.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5386475324630737, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8780593872070312, + "num_tokens": 677235173.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4871008396148682, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8744852542877197, + "num_tokens": 677281450.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5243582725524902, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8821743726730347, + "num_tokens": 677320937.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7142432928085327, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8826802968978882, + "num_tokens": 677353007.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4850358963012695, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8705329895019531, + "num_tokens": 677397634.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7620841264724731, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8692259788513184, + "num_tokens": 677435545.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5755352973937988, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8808466196060181, + "num_tokens": 677479320.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6328049898147583, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8750329613685608, + "num_tokens": 677518193.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6629456281661987, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8652178049087524, + "num_tokens": 677559020.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7247533798217773, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8843868970870972, + "num_tokens": 677595222.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7066508531570435, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8554218411445618, + "num_tokens": 677635014.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6803653240203857, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8734711408615112, + "num_tokens": 677674507.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5120211839675903, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8784188032150269, + "num_tokens": 677717074.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7416445016860962, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8833584189414978, + "num_tokens": 677750495.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.690714955329895, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8855323195457458, + "num_tokens": 677784341.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6352990865707397, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.877854585647583, + "num_tokens": 677821784.0, + "step": 17766 + }, + { + "epoch": 2.2601450197175934, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7274339199066162, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8743910789489746, + "num_tokens": 677865271.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7244515419006348, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.874503493309021, + "num_tokens": 677902930.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7131518125534058, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8675887584686279, + "num_tokens": 677942326.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.664792537689209, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8722094297409058, + "num_tokens": 677981915.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5777866840362549, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.880252480506897, + "num_tokens": 678018616.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7896312475204468, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.872612476348877, + "num_tokens": 678051686.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.700606107711792, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8675071001052856, + "num_tokens": 678087376.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8699449300765991, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8653543591499329, + "num_tokens": 678120505.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6534945964813232, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8796862959861755, + "num_tokens": 678156278.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6632410287857056, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8757694363594055, + "num_tokens": 678193608.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6345720291137695, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8888944983482361, + "num_tokens": 678231073.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6675783395767212, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8690152168273926, + "num_tokens": 678268202.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5065064430236816, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8869725465774536, + "num_tokens": 678309797.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.660361409187317, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8825200796127319, + "num_tokens": 678350305.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6186130046844482, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8831227421760559, + "num_tokens": 678389270.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.626285433769226, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8678778409957886, + "num_tokens": 678430429.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7188256978988647, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8828718662261963, + "num_tokens": 678462911.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.645537257194519, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8676116466522217, + "num_tokens": 678503946.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.685027003288269, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8760676980018616, + "num_tokens": 678542685.0, + "step": 17785 + }, + { + "epoch": 2.262562015010813, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.529396891593933, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8672245144844055, + "num_tokens": 678588186.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5896393060684204, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8780601024627686, + "num_tokens": 678627683.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.644856572151184, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8722621202468872, + "num_tokens": 678667772.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.665000081062317, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8856631517410278, + "num_tokens": 678703886.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8053089380264282, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8748544454574585, + "num_tokens": 678736049.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6465849876403809, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8790810108184814, + "num_tokens": 678776106.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7824679613113403, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8671836853027344, + "num_tokens": 678811652.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6439183950424194, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8618013858795166, + "num_tokens": 678849858.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6002888679504395, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8875647187232971, + "num_tokens": 678886431.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.597589373588562, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8844454288482666, + "num_tokens": 678924664.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.615688443183899, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8681290149688721, + "num_tokens": 678967519.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6706106662750244, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8795307874679565, + "num_tokens": 679004721.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.707092523574829, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8783155679702759, + "num_tokens": 679039578.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.730377197265625, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8463988304138184, + "num_tokens": 679076300.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7216534614562988, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.864812970161438, + "num_tokens": 679114189.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5429195165634155, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8772543668746948, + "num_tokens": 679154646.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.709920883178711, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8920428156852722, + "num_tokens": 679187154.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6462277173995972, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8757822513580322, + "num_tokens": 679222396.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6473002433776855, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8813233375549316, + "num_tokens": 679259221.0, + "step": 17804 + }, + { + "epoch": 2.2649790103040326, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7593531608581543, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.874193549156189, + "num_tokens": 679294238.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7303552627563477, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8711064457893372, + "num_tokens": 679330610.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5785703659057617, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8775888681411743, + "num_tokens": 679368652.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7689309120178223, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8527565598487854, + "num_tokens": 679403472.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6740399599075317, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8675660490989685, + "num_tokens": 679442059.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7901190519332886, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8786803483963013, + "num_tokens": 679475928.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.631252408027649, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8693543672561646, + "num_tokens": 679516206.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.707485556602478, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8816304206848145, + "num_tokens": 679549766.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6471928358078003, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8584192991256714, + "num_tokens": 679590660.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7298336029052734, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8622885346412659, + "num_tokens": 679625939.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.707818627357483, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8730904459953308, + "num_tokens": 679662556.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7869354486465454, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8603556752204895, + "num_tokens": 679695452.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6716442108154297, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8702818155288696, + "num_tokens": 679732227.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6906167268753052, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8825323581695557, + "num_tokens": 679765730.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6355383396148682, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8923646211624146, + "num_tokens": 679802169.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5214935541152954, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8856789469718933, + "num_tokens": 679843284.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6003121137619019, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8752595782279968, + "num_tokens": 679882640.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6411502361297607, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8520380854606628, + "num_tokens": 679924404.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8110089302062988, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8720618486404419, + "num_tokens": 679955969.0, + "step": 17823 + }, + { + "epoch": 2.267396005597252, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7139595746994019, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8832788467407227, + "num_tokens": 679990236.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6228010654449463, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8814427852630615, + "num_tokens": 680024431.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5637786388397217, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8860466480255127, + "num_tokens": 680063969.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9329642057418823, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8621476292610168, + "num_tokens": 680096352.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7788196802139282, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.875016987323761, + "num_tokens": 680131924.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6636651754379272, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8758244514465332, + "num_tokens": 680170441.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5789768695831299, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8653972744941711, + "num_tokens": 680215612.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 3.711799383163452, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8817191123962402, + "num_tokens": 680254696.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8440277576446533, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8697769045829773, + "num_tokens": 680288978.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7751628160476685, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8695228695869446, + "num_tokens": 680322157.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.744581699371338, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8733344078063965, + "num_tokens": 680356737.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.581815481185913, + "learning_rate": 1e-06, + "loss": 0.3181, + "mean_token_accuracy": 0.8962622880935669, + "num_tokens": 680392198.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6541246175765991, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8736252188682556, + "num_tokens": 680429383.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7065232992172241, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8782724738121033, + "num_tokens": 680464187.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8113616704940796, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8765443563461304, + "num_tokens": 680495013.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.82989501953125, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.858729362487793, + "num_tokens": 680531278.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6228652000427246, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8681811094284058, + "num_tokens": 680571469.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6415648460388184, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8659571409225464, + "num_tokens": 680611876.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6142092943191528, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8732231855392456, + "num_tokens": 680648444.0, + "step": 17842 + }, + { + "epoch": 2.2698130008904718, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6959329843521118, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8628695607185364, + "num_tokens": 680685747.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.601830005645752, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8747519850730896, + "num_tokens": 680728302.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.546134352684021, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8845933079719543, + "num_tokens": 680766831.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6042447090148926, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8899444341659546, + "num_tokens": 680803497.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6574132442474365, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8795337677001953, + "num_tokens": 680839498.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6248725652694702, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.869682788848877, + "num_tokens": 680880174.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5887531042099, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8766635656356812, + "num_tokens": 680917537.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6819852590560913, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8645734786987305, + "num_tokens": 680956681.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6388943195343018, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8733479976654053, + "num_tokens": 680993230.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7015469074249268, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8710765838623047, + "num_tokens": 681027571.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8035961389541626, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.869549036026001, + "num_tokens": 681065043.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.676304817199707, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8560395240783691, + "num_tokens": 681104235.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6853039264678955, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8785269260406494, + "num_tokens": 681139407.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8034554719924927, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8742194771766663, + "num_tokens": 681172521.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7016736268997192, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8717529773712158, + "num_tokens": 681209148.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.604554533958435, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8684909343719482, + "num_tokens": 681256239.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6701345443725586, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8710892200469971, + "num_tokens": 681295192.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6067348718643188, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8889239430427551, + "num_tokens": 681332533.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7065401077270508, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8780866265296936, + "num_tokens": 681373255.0, + "step": 17861 + }, + { + "epoch": 2.272229996183692, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7301524877548218, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8687262535095215, + "num_tokens": 681406193.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5358095169067383, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8912762403488159, + "num_tokens": 681444850.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7585597038269043, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8912765979766846, + "num_tokens": 681476301.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.584503173828125, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8750362396240234, + "num_tokens": 681517487.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6200196743011475, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8684604167938232, + "num_tokens": 681556535.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5636613368988037, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8737157583236694, + "num_tokens": 681599232.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6609961986541748, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8743724226951599, + "num_tokens": 681636957.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5758129358291626, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8800039291381836, + "num_tokens": 681674366.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6574324369430542, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8715320229530334, + "num_tokens": 681713388.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6242928504943848, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8752850294113159, + "num_tokens": 681753485.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6029072999954224, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.874419093132019, + "num_tokens": 681795948.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6986321210861206, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8771829009056091, + "num_tokens": 681833819.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7847212553024292, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8732802271842957, + "num_tokens": 681868961.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6504079103469849, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8778297305107117, + "num_tokens": 681906174.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7315706014633179, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8702002763748169, + "num_tokens": 681944189.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5916621685028076, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8821307420730591, + "num_tokens": 681983680.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6977969408035278, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8596827983856201, + "num_tokens": 682025767.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6972533464431763, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8762887120246887, + "num_tokens": 682064266.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6333389282226562, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8605473637580872, + "num_tokens": 682105655.0, + "step": 17880 + }, + { + "epoch": 2.2746469914769114, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6662890911102295, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8853391408920288, + "num_tokens": 682139859.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5545092821121216, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8697317838668823, + "num_tokens": 682183238.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5604921579360962, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8750284910202026, + "num_tokens": 682223273.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7127172946929932, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8720905780792236, + "num_tokens": 682260612.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7391180992126465, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8606762886047363, + "num_tokens": 682301776.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6500593423843384, + "learning_rate": 1e-06, + "loss": 0.3238, + "mean_token_accuracy": 0.8943778276443481, + "num_tokens": 682336896.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9569586515426636, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8686270713806152, + "num_tokens": 682366228.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5606945753097534, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8794106245040894, + "num_tokens": 682408624.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5585558414459229, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.878212571144104, + "num_tokens": 682449304.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6443842649459839, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8810186982154846, + "num_tokens": 682486107.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5627306699752808, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8624364137649536, + "num_tokens": 682531943.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5968629121780396, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8841379880905151, + "num_tokens": 682572344.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6473389863967896, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8741390705108643, + "num_tokens": 682612512.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6839613914489746, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8700190782546997, + "num_tokens": 682651449.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5479941368103027, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8720219135284424, + "num_tokens": 682692578.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.4202182292938232, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8892835974693298, + "num_tokens": 682740413.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5901163816452026, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8688241839408875, + "num_tokens": 682781397.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.624439001083374, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8705490827560425, + "num_tokens": 682822182.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8005350828170776, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.875098466873169, + "num_tokens": 682856271.0, + "step": 17899 + }, + { + "epoch": 2.277063986770131, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5068342685699463, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8845044374465942, + "num_tokens": 682901440.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5976413488388062, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8917140364646912, + "num_tokens": 682932634.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6523206233978271, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8624120950698853, + "num_tokens": 682972029.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5875788927078247, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.8917312622070312, + "num_tokens": 683013142.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.767533540725708, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8791981339454651, + "num_tokens": 683049195.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7189148664474487, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8706308603286743, + "num_tokens": 683085484.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7343326807022095, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.879044771194458, + "num_tokens": 683122811.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8579498529434204, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8569069504737854, + "num_tokens": 683156838.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7226659059524536, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8690581321716309, + "num_tokens": 683194179.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.663974404335022, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8647356033325195, + "num_tokens": 683233591.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5934048891067505, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8619542717933655, + "num_tokens": 683277376.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5280617475509644, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8844410181045532, + "num_tokens": 683315556.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.58834969997406, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8767985701560974, + "num_tokens": 683356168.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6950136423110962, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8632279634475708, + "num_tokens": 683396018.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6398431062698364, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8745021820068359, + "num_tokens": 683435287.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5483713150024414, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8784781694412231, + "num_tokens": 683477835.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6790913343429565, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8665395975112915, + "num_tokens": 683516276.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6447205543518066, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8772104382514954, + "num_tokens": 683555410.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.840523362159729, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8726121783256531, + "num_tokens": 683585236.0, + "step": 17918 + }, + { + "epoch": 2.2794809820633506, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6300088167190552, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8658832311630249, + "num_tokens": 683624996.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7545768022537231, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8784809708595276, + "num_tokens": 683657801.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6541262865066528, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8739783763885498, + "num_tokens": 683696076.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6542829275131226, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8665300607681274, + "num_tokens": 683736001.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7808842658996582, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.883271336555481, + "num_tokens": 683770422.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7117938995361328, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8541801571846008, + "num_tokens": 683810389.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7713481187820435, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.863010048866272, + "num_tokens": 683846532.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6362303495407104, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8652505278587341, + "num_tokens": 683883246.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.757458209991455, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8675070405006409, + "num_tokens": 683923891.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6273759603500366, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8775231242179871, + "num_tokens": 683964346.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6091676950454712, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8703248500823975, + "num_tokens": 684006434.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5261071920394897, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8786876201629639, + "num_tokens": 684046967.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6357859373092651, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8684414625167847, + "num_tokens": 684084186.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.692973256111145, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8871622085571289, + "num_tokens": 684119242.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.587603211402893, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8596622943878174, + "num_tokens": 684162141.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6138170957565308, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8728906512260437, + "num_tokens": 684200148.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6616716384887695, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8803404569625854, + "num_tokens": 684235410.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6414849758148193, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8798378109931946, + "num_tokens": 684273117.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5959123373031616, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8877748250961304, + "num_tokens": 684309707.0, + "step": 17937 + }, + { + "epoch": 2.2818979773565706, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5187832117080688, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8844735622406006, + "num_tokens": 684355742.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7368004322052002, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.86722731590271, + "num_tokens": 684393490.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.607565999031067, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8741155862808228, + "num_tokens": 684431934.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6116055250167847, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8729346990585327, + "num_tokens": 684469713.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7950804233551025, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8748683929443359, + "num_tokens": 684502394.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6507898569107056, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8689863085746765, + "num_tokens": 684541582.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 3.7339279651641846, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8713481426239014, + "num_tokens": 684581494.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7191166877746582, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8833956122398376, + "num_tokens": 684624294.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6450592279434204, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8754040002822876, + "num_tokens": 684662574.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.8584429025650024, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8701664805412292, + "num_tokens": 684698354.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.538115382194519, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8718185424804688, + "num_tokens": 684741965.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6581363677978516, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8844685554504395, + "num_tokens": 684781023.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6785467863082886, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8743929862976074, + "num_tokens": 684821849.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7459849119186401, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8735499382019043, + "num_tokens": 684857341.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7481023073196411, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8657931685447693, + "num_tokens": 684895101.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9413645267486572, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.871384859085083, + "num_tokens": 684925231.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.651606798171997, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8612239956855774, + "num_tokens": 684963548.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6461161375045776, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8924920558929443, + "num_tokens": 684998521.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7352453470230103, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.868441104888916, + "num_tokens": 685032580.0, + "step": 17956 + }, + { + "epoch": 2.28431497264979, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7407042980194092, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8765599131584167, + "num_tokens": 685066827.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7567602396011353, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8672896027565002, + "num_tokens": 685103326.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6067540645599365, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8840138912200928, + "num_tokens": 685142896.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.655346155166626, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8681869506835938, + "num_tokens": 685183157.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7600122690200806, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8710923194885254, + "num_tokens": 685216525.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6027737855911255, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8832314014434814, + "num_tokens": 685253336.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6219168901443481, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8712223768234253, + "num_tokens": 685292831.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6126158237457275, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8737496733665466, + "num_tokens": 685332895.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5730594396591187, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8628715872764587, + "num_tokens": 685375251.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5650969743728638, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8802935481071472, + "num_tokens": 685416787.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6822599172592163, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8878262042999268, + "num_tokens": 685454418.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.8026288747787476, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8881857991218567, + "num_tokens": 685484809.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.64732825756073, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8770996332168579, + "num_tokens": 685522985.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.7110198736190796, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8682861924171448, + "num_tokens": 685559107.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6311838626861572, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8830957412719727, + "num_tokens": 685603025.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.9088016748428345, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8621051907539368, + "num_tokens": 685635783.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.6121231317520142, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8738054037094116, + "num_tokens": 685676461.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 2.7298927307128906e-05, + "grad_norm": 1.5951619148254395, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8744094371795654, + "num_tokens": 685716158.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5752004384994507, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8879874348640442, + "num_tokens": 685754978.0, + "step": 17975 + }, + { + "epoch": 2.28673196794301, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6184715032577515, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8446645140647888, + "num_tokens": 685799566.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6035953760147095, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8774974346160889, + "num_tokens": 685837087.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.710009217262268, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8686206936836243, + "num_tokens": 685872786.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.586998701095581, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8833636045455933, + "num_tokens": 685912496.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5784169435501099, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8860371112823486, + "num_tokens": 685950442.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6080539226531982, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8789976239204407, + "num_tokens": 685986885.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.605770468711853, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8828212022781372, + "num_tokens": 686025556.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6454882621765137, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8844782114028931, + "num_tokens": 686061998.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.651037573814392, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8754202127456665, + "num_tokens": 686099256.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7985904216766357, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8738877773284912, + "num_tokens": 686132053.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5838382244110107, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8765026330947876, + "num_tokens": 686170753.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5066728591918945, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8771218657493591, + "num_tokens": 686213851.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5698115825653076, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8670461177825928, + "num_tokens": 686256776.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5266274213790894, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8742585182189941, + "num_tokens": 686296698.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7896780967712402, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8646913766860962, + "num_tokens": 686333758.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5249217748641968, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8775179386138916, + "num_tokens": 686376806.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5963704586029053, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8759907484054565, + "num_tokens": 686418497.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6489646434783936, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8724862337112427, + "num_tokens": 686456791.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.807793140411377, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8755356073379517, + "num_tokens": 686491390.0, + "step": 17994 + }, + { + "epoch": 2.2891489632362294, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6002331972122192, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8866990804672241, + "num_tokens": 686529047.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6640372276306152, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8735657334327698, + "num_tokens": 686568657.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5364105701446533, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8736511468887329, + "num_tokens": 686610738.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6221330165863037, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8747693300247192, + "num_tokens": 686648875.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.629042625427246, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8833804130554199, + "num_tokens": 686685432.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6078499555587769, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.869225025177002, + "num_tokens": 686726011.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6460628509521484, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.871933102607727, + "num_tokens": 686766579.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7082853317260742, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8755985498428345, + "num_tokens": 686803859.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5610246658325195, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8781254291534424, + "num_tokens": 686844304.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.713517665863037, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8747415542602539, + "num_tokens": 686878562.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6779065132141113, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.885418713092804, + "num_tokens": 686912075.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6266498565673828, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8649930953979492, + "num_tokens": 686955598.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7953890562057495, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8664186000823975, + "num_tokens": 686994231.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6988171339035034, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8760982155799866, + "num_tokens": 687029495.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5299420356750488, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8832941055297852, + "num_tokens": 687070618.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6766728162765503, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8676100373268127, + "num_tokens": 687113842.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5902844667434692, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8705264329910278, + "num_tokens": 687152110.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5432099103927612, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8819385766983032, + "num_tokens": 687192193.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5339932441711426, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8792716264724731, + "num_tokens": 687234000.0, + "step": 18013 + }, + { + "epoch": 2.291565958529449, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6253970861434937, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8702399134635925, + "num_tokens": 687277577.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6557027101516724, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8747451901435852, + "num_tokens": 687314570.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7708946466445923, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.8906443119049072, + "num_tokens": 687348431.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.589381456375122, + "learning_rate": 1e-06, + "loss": 0.3564, + "mean_token_accuracy": 0.8841745853424072, + "num_tokens": 687388860.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6992549896240234, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8830215334892273, + "num_tokens": 687425773.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.753352403640747, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.880897581577301, + "num_tokens": 687460691.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.63188636302948, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8797377347946167, + "num_tokens": 687497309.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6447252035140991, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.868700385093689, + "num_tokens": 687535432.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7768560647964478, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8459908962249756, + "num_tokens": 687571353.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.871208667755127, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8747337460517883, + "num_tokens": 687605246.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7064257860183716, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8561400175094604, + "num_tokens": 687646346.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5453516244888306, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8702524900436401, + "num_tokens": 687689795.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6458580493927002, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8719009757041931, + "num_tokens": 687725401.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5424175262451172, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8711646199226379, + "num_tokens": 687768704.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7092238664627075, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8836995363235474, + "num_tokens": 687806929.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7284337282180786, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8733977675437927, + "num_tokens": 687843087.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7053968906402588, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8609897494316101, + "num_tokens": 687883369.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5736701488494873, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8862811923027039, + "num_tokens": 687919639.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5011305809020996, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8828401565551758, + "num_tokens": 687965047.0, + "step": 18032 + }, + { + "epoch": 2.293982953822669, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.8852295875549316, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8619022965431213, + "num_tokens": 687999871.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5806385278701782, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.884686291217804, + "num_tokens": 688036006.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.7125405073165894, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.860297441482544, + "num_tokens": 688074350.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.5856729745864868, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8771680593490601, + "num_tokens": 688113031.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6031694412231445, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8701293468475342, + "num_tokens": 688152046.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.888661503791809, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.870863676071167, + "num_tokens": 688182036.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.6349481344223022, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8734211921691895, + "num_tokens": 688221250.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.731094241142273, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8669571876525879, + "num_tokens": 688262307.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 2.7418136596679688e-05, + "grad_norm": 1.590106725692749, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8883375525474548, + "num_tokens": 688300670.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8768173456192017, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8688994646072388, + "num_tokens": 688334702.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5484917163848877, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8872102499008179, + "num_tokens": 688373847.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5511596202850342, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8771601319313049, + "num_tokens": 688415234.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6987080574035645, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8596959710121155, + "num_tokens": 688453120.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7762426137924194, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8793454170227051, + "num_tokens": 688487916.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5154401063919067, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8852580785751343, + "num_tokens": 688532472.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.739133596420288, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.884206235408783, + "num_tokens": 688565369.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6030442714691162, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8800071477890015, + "num_tokens": 688600574.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.68988037109375, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.888334333896637, + "num_tokens": 688633983.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8175514936447144, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8762989640235901, + "num_tokens": 688672870.0, + "step": 18051 + }, + { + "epoch": 2.2963999491158886, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.764194130897522, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8676287531852722, + "num_tokens": 688707252.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7799006700515747, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8766751885414124, + "num_tokens": 688740524.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7844619750976562, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8695208430290222, + "num_tokens": 688775182.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6535389423370361, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8813068270683289, + "num_tokens": 688812370.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6707149744033813, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8752028942108154, + "num_tokens": 688850529.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.880592942237854, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8686376214027405, + "num_tokens": 688885093.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7333977222442627, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8764257431030273, + "num_tokens": 688919650.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6690690517425537, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8837388753890991, + "num_tokens": 688959680.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7360161542892456, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8778354525566101, + "num_tokens": 688998005.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.538963794708252, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8811154365539551, + "num_tokens": 689038354.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6153509616851807, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8752838373184204, + "num_tokens": 689076813.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7122657299041748, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8796180486679077, + "num_tokens": 689115314.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7113988399505615, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8727586269378662, + "num_tokens": 689152888.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7123379707336426, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8641130328178406, + "num_tokens": 689192949.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6004976034164429, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8767433762550354, + "num_tokens": 689234931.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6492167711257935, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.873550295829773, + "num_tokens": 689275834.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.681922197341919, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8856902122497559, + "num_tokens": 689311891.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8724721670150757, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.864412784576416, + "num_tokens": 689345812.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7273054122924805, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8891476392745972, + "num_tokens": 689377150.0, + "step": 18070 + }, + { + "epoch": 2.298816944409108, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6231244802474976, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8794398307800293, + "num_tokens": 689414366.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6429857015609741, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8493927121162415, + "num_tokens": 689458965.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7154548168182373, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8800144195556641, + "num_tokens": 689494311.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8528552055358887, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8771728277206421, + "num_tokens": 689528379.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.587706208229065, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8883322477340698, + "num_tokens": 689563421.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8213618993759155, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8653558492660522, + "num_tokens": 689597568.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.697373867034912, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8739141821861267, + "num_tokens": 689634768.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.581710696220398, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8951588273048401, + "num_tokens": 689672422.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5330393314361572, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8814267516136169, + "num_tokens": 689713925.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6617995500564575, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8802621364593506, + "num_tokens": 689747816.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6466457843780518, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8868018984794617, + "num_tokens": 689782978.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.629993200302124, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8679406642913818, + "num_tokens": 689823975.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.767340898513794, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8763843774795532, + "num_tokens": 689855599.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7287498712539673, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.867739737033844, + "num_tokens": 689891018.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7530617713928223, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8858773112297058, + "num_tokens": 689923756.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6418566703796387, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8751286864280701, + "num_tokens": 689961517.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6705032587051392, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8819390535354614, + "num_tokens": 689997987.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6427080631256104, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8639353513717651, + "num_tokens": 690041836.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7664769887924194, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8583523035049438, + "num_tokens": 690076792.0, + "step": 18089 + }, + { + "epoch": 2.301233939702328, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6628810167312622, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8684249520301819, + "num_tokens": 690116776.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 3.690251588821411, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8809279203414917, + "num_tokens": 690152716.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6801950931549072, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8683059811592102, + "num_tokens": 690193416.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.706320881843567, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8702396154403687, + "num_tokens": 690236900.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5810710191726685, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8821582794189453, + "num_tokens": 690279945.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6428124904632568, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8749260306358337, + "num_tokens": 690316086.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.670088529586792, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8806501626968384, + "num_tokens": 690352062.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7004183530807495, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8725487589836121, + "num_tokens": 690388002.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.606703281402588, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8652756214141846, + "num_tokens": 690430523.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6211200952529907, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8699060082435608, + "num_tokens": 690469076.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.639946699142456, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8664449453353882, + "num_tokens": 690509237.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5613850355148315, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8738574981689453, + "num_tokens": 690550664.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6951024532318115, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8875355124473572, + "num_tokens": 690582529.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5195752382278442, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8860486745834351, + "num_tokens": 690622097.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.731956124305725, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8815982341766357, + "num_tokens": 690654813.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6751813888549805, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8661003708839417, + "num_tokens": 690691971.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6804072856903076, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8742671608924866, + "num_tokens": 690726695.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.859281063079834, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8895490765571594, + "num_tokens": 690760528.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.62100350856781, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8703422546386719, + "num_tokens": 690802204.0, + "step": 18108 + }, + { + "epoch": 2.303650934995548, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.507979154586792, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8697259426116943, + "num_tokens": 690847059.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6025404930114746, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8825855255126953, + "num_tokens": 690882110.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8024792671203613, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8845584392547607, + "num_tokens": 690914578.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5907118320465088, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.874380350112915, + "num_tokens": 690954623.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.55495023727417, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8892987966537476, + "num_tokens": 690994835.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5988160371780396, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.868678629398346, + "num_tokens": 691034878.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6269357204437256, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8734662532806396, + "num_tokens": 691072900.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.558135747909546, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8824282884597778, + "num_tokens": 691112191.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6939738988876343, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8810946941375732, + "num_tokens": 691147821.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7273786067962646, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.881071925163269, + "num_tokens": 691183964.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6988343000411987, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8770501613616943, + "num_tokens": 691220393.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.695959448814392, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8771082758903503, + "num_tokens": 691258283.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6736946105957031, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8767169713973999, + "num_tokens": 691297066.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6678063869476318, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8768513202667236, + "num_tokens": 691338043.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6162045001983643, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8752549886703491, + "num_tokens": 691379033.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5077123641967773, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.875486433506012, + "num_tokens": 691423152.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6814992427825928, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.876213550567627, + "num_tokens": 691464906.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7337491512298584, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8713380098342896, + "num_tokens": 691499234.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.546079158782959, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8727604150772095, + "num_tokens": 691539376.0, + "step": 18127 + }, + { + "epoch": 2.3060679302887674, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6574288606643677, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.86695396900177, + "num_tokens": 691579115.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6028586626052856, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8802735805511475, + "num_tokens": 691614376.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6032367944717407, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8735171556472778, + "num_tokens": 691650967.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6883429288864136, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8763858675956726, + "num_tokens": 691690443.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.707740068435669, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8817084431648254, + "num_tokens": 691726157.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6468698978424072, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8810712099075317, + "num_tokens": 691763879.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5303772687911987, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8674225807189941, + "num_tokens": 691811499.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7238696813583374, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8761353492736816, + "num_tokens": 691847174.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6798211336135864, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8786987662315369, + "num_tokens": 691887819.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6538738012313843, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8612123727798462, + "num_tokens": 691931205.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6051822900772095, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8734399080276489, + "num_tokens": 691969625.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5797827243804932, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8743403553962708, + "num_tokens": 692010512.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 3.704446315765381, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8788872957229614, + "num_tokens": 692049801.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6392227411270142, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8711458444595337, + "num_tokens": 692088460.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6393276453018188, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8833961486816406, + "num_tokens": 692124027.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7398808002471924, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8644227981567383, + "num_tokens": 692160286.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6272127628326416, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8721635341644287, + "num_tokens": 692197312.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6250909566879272, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8675920963287354, + "num_tokens": 692236238.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6619226932525635, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8538546562194824, + "num_tokens": 692278758.0, + "step": 18146 + }, + { + "epoch": 2.308484925581987, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6203209161758423, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8706017732620239, + "num_tokens": 692317180.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.783656358718872, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8868957757949829, + "num_tokens": 692348571.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8224939107894897, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8738953471183777, + "num_tokens": 692386060.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.68231999874115, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8724481463432312, + "num_tokens": 692423999.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6570817232131958, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8731282353401184, + "num_tokens": 692460001.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6726658344268799, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8739989995956421, + "num_tokens": 692496966.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7730891704559326, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8650252819061279, + "num_tokens": 692532680.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6684924364089966, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.87010657787323, + "num_tokens": 692572256.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6329562664031982, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8615055680274963, + "num_tokens": 692614709.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.9637030363082886, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8716416358947754, + "num_tokens": 692643747.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6066354513168335, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8763985633850098, + "num_tokens": 692682037.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6875001192092896, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8598883152008057, + "num_tokens": 692721900.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8357826471328735, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8735065460205078, + "num_tokens": 692756725.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5847651958465576, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8709732890129089, + "num_tokens": 692799419.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6100623607635498, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8804824352264404, + "num_tokens": 692840705.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.9582048654556274, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8610736131668091, + "num_tokens": 692872298.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6232103109359741, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8817696571350098, + "num_tokens": 692909763.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6582919359207153, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8566012978553772, + "num_tokens": 692950010.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7700985670089722, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8622064590454102, + "num_tokens": 692988366.0, + "step": 18165 + }, + { + "epoch": 2.3109019208752066, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5167131423950195, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8783774375915527, + "num_tokens": 693034533.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6551300287246704, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8683674335479736, + "num_tokens": 693075552.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6018272638320923, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8586405515670776, + "num_tokens": 693119503.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6866917610168457, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8768572807312012, + "num_tokens": 693157255.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7766263484954834, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8588789701461792, + "num_tokens": 693195031.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.723353624343872, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8756054639816284, + "num_tokens": 693229741.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5876256227493286, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8935161232948303, + "num_tokens": 693266744.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7443838119506836, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8798061609268188, + "num_tokens": 693299880.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7098599672317505, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8786988854408264, + "num_tokens": 693339229.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.4512234926223755, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8883974552154541, + "num_tokens": 693385573.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 3.821584463119507, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8773813247680664, + "num_tokens": 693421317.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6686326265335083, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8843448758125305, + "num_tokens": 693455792.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7453283071517944, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8690835237503052, + "num_tokens": 693496022.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7799159288406372, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8769086599349976, + "num_tokens": 693528280.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7288060188293457, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8753659725189209, + "num_tokens": 693566046.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7245371341705322, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8756074905395508, + "num_tokens": 693600197.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7699328660964966, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8639262318611145, + "num_tokens": 693635546.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7039800882339478, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.851129949092865, + "num_tokens": 693678964.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5588144063949585, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8723090887069702, + "num_tokens": 693720919.0, + "step": 18184 + }, + { + "epoch": 2.313318916168426, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5058292150497437, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.878870964050293, + "num_tokens": 693762756.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5233184099197388, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8873468637466431, + "num_tokens": 693803722.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5953004360198975, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8764010667800903, + "num_tokens": 693845057.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7254533767700195, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8828533291816711, + "num_tokens": 693881549.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.534250259399414, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8788130283355713, + "num_tokens": 693920736.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.60641348361969, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8728034496307373, + "num_tokens": 693961431.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6386775970458984, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.858673095703125, + "num_tokens": 694001545.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6099073886871338, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8711060881614685, + "num_tokens": 694044457.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6134650707244873, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8681796789169312, + "num_tokens": 694085646.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6450769901275635, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8889204859733582, + "num_tokens": 694125446.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6082147359848022, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8822718858718872, + "num_tokens": 694163317.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.9572765827178955, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8726122379302979, + "num_tokens": 694201605.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5976216793060303, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.8865689039230347, + "num_tokens": 694239758.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7939786911010742, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8512535095214844, + "num_tokens": 694276271.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6190297603607178, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8656044006347656, + "num_tokens": 694317082.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.641418695449829, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8775216341018677, + "num_tokens": 694356458.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.648861050605774, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8734406232833862, + "num_tokens": 694399297.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6376451253890991, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8787512183189392, + "num_tokens": 694437380.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6890593767166138, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8696166276931763, + "num_tokens": 694477959.0, + "step": 18203 + }, + { + "epoch": 2.3157359114616463, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5214248895645142, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8679962158203125, + "num_tokens": 694522976.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6931231021881104, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.88380366563797, + "num_tokens": 694558695.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6534959077835083, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8815001249313354, + "num_tokens": 694596251.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7519865036010742, + "learning_rate": 1e-06, + "loss": 0.3295, + "mean_token_accuracy": 0.8934823870658875, + "num_tokens": 694632620.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.551236867904663, + "learning_rate": 1e-06, + "loss": 0.3479, + "mean_token_accuracy": 0.8861907124519348, + "num_tokens": 694671955.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7350635528564453, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8771349191665649, + "num_tokens": 694703409.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.627846121788025, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8687736988067627, + "num_tokens": 694745959.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.7008295059204102, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8687757253646851, + "num_tokens": 694780506.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.9420422315597534, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8499473333358765, + "num_tokens": 694813533.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.8204782009124756, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8692348003387451, + "num_tokens": 694847906.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6931706666946411, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.875174880027771, + "num_tokens": 694883872.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.5892423391342163, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8783591389656067, + "num_tokens": 694925434.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6632159948349, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8728893399238586, + "num_tokens": 694963704.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5141980648040771, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8898801803588867, + "num_tokens": 695001819.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5421452522277832, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.883415699005127, + "num_tokens": 695040581.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.613247275352478, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8760342597961426, + "num_tokens": 695081853.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "ewc_loss": 2.753734588623047e-05, + "grad_norm": 1.6683849096298218, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8628560304641724, + "num_tokens": 695121148.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7069716453552246, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8746066093444824, + "num_tokens": 695154108.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7120002508163452, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8791725039482117, + "num_tokens": 695189257.0, + "step": 18222 + }, + { + "epoch": 2.318152906754866, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6254138946533203, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8754295110702515, + "num_tokens": 695230900.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6304577589035034, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.884537935256958, + "num_tokens": 695265514.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.810167670249939, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.862852156162262, + "num_tokens": 695299055.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7155548334121704, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8837683200836182, + "num_tokens": 695331197.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6346588134765625, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8807104825973511, + "num_tokens": 695370497.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6114888191223145, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8655371069908142, + "num_tokens": 695411329.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.602579951286316, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8801482319831848, + "num_tokens": 695450633.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5717365741729736, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8806046843528748, + "num_tokens": 695496048.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7606711387634277, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8725442886352539, + "num_tokens": 695531482.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.684067964553833, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8766723871231079, + "num_tokens": 695567349.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6566367149353027, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8775193691253662, + "num_tokens": 695604607.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 4.686140537261963, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8868985176086426, + "num_tokens": 695640158.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6374093294143677, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8815053701400757, + "num_tokens": 695677502.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6176097393035889, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8733015060424805, + "num_tokens": 695715837.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7125754356384277, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8766512274742126, + "num_tokens": 695750697.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7446808815002441, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8729771375656128, + "num_tokens": 695787019.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7261450290679932, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8654725551605225, + "num_tokens": 695824269.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.58907151222229, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8766398429870605, + "num_tokens": 695862828.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6220284700393677, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8817748427391052, + "num_tokens": 695900232.0, + "step": 18241 + }, + { + "epoch": 2.3205699020480854, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5937480926513672, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.878018856048584, + "num_tokens": 695941088.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.4822124242782593, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8798172473907471, + "num_tokens": 695983782.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6162381172180176, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8726987838745117, + "num_tokens": 696022462.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6180793046951294, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8619413375854492, + "num_tokens": 696060705.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6964020729064941, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8839514255523682, + "num_tokens": 696095644.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.74639093875885, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8806957602500916, + "num_tokens": 696129145.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5909886360168457, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.888071596622467, + "num_tokens": 696170121.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.9743964672088623, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8696829080581665, + "num_tokens": 696199462.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7588965892791748, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.879287838935852, + "num_tokens": 696234779.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5699920654296875, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8840049505233765, + "num_tokens": 696275634.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5534424781799316, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8861594200134277, + "num_tokens": 696314893.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.717215657234192, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8705991506576538, + "num_tokens": 696353191.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.752763032913208, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8788178563117981, + "num_tokens": 696384465.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.8167712688446045, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8721579909324646, + "num_tokens": 696417410.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.620535969734192, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8891571760177612, + "num_tokens": 696454207.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5490912199020386, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8827515840530396, + "num_tokens": 696492916.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7808691263198853, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8758058547973633, + "num_tokens": 696526537.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7647991180419922, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8664079308509827, + "num_tokens": 696563006.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6956567764282227, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.879676342010498, + "num_tokens": 696597309.0, + "step": 18260 + }, + { + "epoch": 2.322986897341305, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.713798999786377, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.872547447681427, + "num_tokens": 696634787.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7707794904708862, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8724822402000427, + "num_tokens": 696667688.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5553040504455566, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8919135332107544, + "num_tokens": 696704278.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5786387920379639, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8728586435317993, + "num_tokens": 696745325.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6738922595977783, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8725298643112183, + "num_tokens": 696786162.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6735197305679321, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8842071294784546, + "num_tokens": 696821055.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6115869283676147, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8766517639160156, + "num_tokens": 696859552.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6066803932189941, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8788182139396667, + "num_tokens": 696900319.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5357937812805176, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8792201280593872, + "num_tokens": 696942890.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.552018642425537, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8740410804748535, + "num_tokens": 696983482.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5576741695404053, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8867268562316895, + "num_tokens": 697025076.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6419689655303955, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.872384250164032, + "num_tokens": 697066324.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6251866817474365, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8716791272163391, + "num_tokens": 697102973.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5378729104995728, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.889764666557312, + "num_tokens": 697138357.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5527658462524414, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8754416704177856, + "num_tokens": 697179935.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5502692461013794, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.888054609298706, + "num_tokens": 697219284.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.639660120010376, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8762789368629456, + "num_tokens": 697257817.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.568312406539917, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8847203254699707, + "num_tokens": 697297234.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6386429071426392, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.868975043296814, + "num_tokens": 697337080.0, + "step": 18279 + }, + { + "epoch": 2.325403892634525, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.687708854675293, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8880689144134521, + "num_tokens": 697369816.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6547454595565796, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8655322790145874, + "num_tokens": 697410506.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6887543201446533, + "learning_rate": 1e-06, + "loss": 0.3374, + "mean_token_accuracy": 0.8902864456176758, + "num_tokens": 697445180.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7318964004516602, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8581120371818542, + "num_tokens": 697485994.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5978912115097046, + "learning_rate": 1e-06, + "loss": 0.31, + "mean_token_accuracy": 0.897180438041687, + "num_tokens": 697521267.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7754056453704834, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8686982989311218, + "num_tokens": 697554008.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5257779359817505, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.888847291469574, + "num_tokens": 697592020.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6303558349609375, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8670900464057922, + "num_tokens": 697634449.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6236430406570435, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8737235069274902, + "num_tokens": 697674231.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5592907667160034, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8692943453788757, + "num_tokens": 697716358.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6073795557022095, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8764804601669312, + "num_tokens": 697756942.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7980459928512573, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8735666275024414, + "num_tokens": 697796131.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6956239938735962, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.882642388343811, + "num_tokens": 697832691.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7340707778930664, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8815686702728271, + "num_tokens": 697867419.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.658414602279663, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8766828775405884, + "num_tokens": 697907386.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6901426315307617, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8703269958496094, + "num_tokens": 697945958.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7286728620529175, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8517016172409058, + "num_tokens": 697983771.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6271941661834717, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8706756234169006, + "num_tokens": 698021716.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5852032899856567, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8675251603126526, + "num_tokens": 698063143.0, + "step": 18298 + }, + { + "epoch": 2.3278208879277447, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6608086824417114, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8748152852058411, + "num_tokens": 698102389.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.747631311416626, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8674034476280212, + "num_tokens": 698139898.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.789897084236145, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.865181565284729, + "num_tokens": 698177486.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6648261547088623, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8786435723304749, + "num_tokens": 698212207.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6719647645950317, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8857986927032471, + "num_tokens": 698247782.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.8235455751419067, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8764317035675049, + "num_tokens": 698285229.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6903111934661865, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8770862817764282, + "num_tokens": 698321738.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5222057104110718, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.88962721824646, + "num_tokens": 698361221.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6959072351455688, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8664909601211548, + "num_tokens": 698398810.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.631893277168274, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.862392783164978, + "num_tokens": 698443241.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7849290370941162, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8734742403030396, + "num_tokens": 698481337.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5252165794372559, + "learning_rate": 1e-06, + "loss": 0.3259, + "mean_token_accuracy": 0.8926007151603699, + "num_tokens": 698522423.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6812602281570435, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8746863007545471, + "num_tokens": 698559317.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.7673832178115845, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8790677785873413, + "num_tokens": 698592145.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.717771053314209, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8676798939704895, + "num_tokens": 698633230.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.755476474761963, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8837500810623169, + "num_tokens": 698672032.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7740349769592285, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8820574283599854, + "num_tokens": 698708264.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.8169556856155396, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8708711266517639, + "num_tokens": 698744203.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7900257110595703, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8690295815467834, + "num_tokens": 698777112.0, + "step": 18317 + }, + { + "epoch": 2.3302378832209643, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7202155590057373, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8766216039657593, + "num_tokens": 698813168.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.813062310218811, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8794090747833252, + "num_tokens": 698846952.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7158907651901245, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8645190000534058, + "num_tokens": 698885440.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6415904760360718, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8923113346099854, + "num_tokens": 698921735.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7286100387573242, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8709197640419006, + "num_tokens": 698960134.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6941553354263306, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.883346438407898, + "num_tokens": 698993602.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5932986736297607, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.880481481552124, + "num_tokens": 699030670.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5418167114257812, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8824280500411987, + "num_tokens": 699073042.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6961928606033325, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8751702308654785, + "num_tokens": 699109046.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5927897691726685, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8834928274154663, + "num_tokens": 699149803.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5449947118759155, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8752806186676025, + "num_tokens": 699190457.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6627520322799683, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8664077520370483, + "num_tokens": 699232366.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6241142749786377, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8667234182357788, + "num_tokens": 699271795.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6396430730819702, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8633229732513428, + "num_tokens": 699312496.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.5650564432144165, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8816691637039185, + "num_tokens": 699352448.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5088740587234497, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8693536520004272, + "num_tokens": 699397067.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.552561640739441, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8830511569976807, + "num_tokens": 699437740.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6031086444854736, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8854236602783203, + "num_tokens": 699473977.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6536269187927246, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8786388039588928, + "num_tokens": 699512907.0, + "step": 18336 + }, + { + "epoch": 2.332654878514184, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5397112369537354, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8801095485687256, + "num_tokens": 699558825.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 2.765655517578125e-05, + "grad_norm": 1.6648083925247192, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.857132613658905, + "num_tokens": 699599819.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.8436760902404785, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8715949058532715, + "num_tokens": 699629016.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6434595584869385, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8833422064781189, + "num_tokens": 699665038.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5817111730575562, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.891137957572937, + "num_tokens": 699704615.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5867297649383545, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8733997941017151, + "num_tokens": 699750149.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.609980821609497, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8859034180641174, + "num_tokens": 699788445.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.8971225023269653, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8367036581039429, + "num_tokens": 699821389.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.620502233505249, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8830219507217407, + "num_tokens": 699861252.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5812329053878784, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8714891672134399, + "num_tokens": 699905858.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.693904995918274, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8662576079368591, + "num_tokens": 699943605.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6012696027755737, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.866381824016571, + "num_tokens": 699982140.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.9815824031829834, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.878628134727478, + "num_tokens": 700018053.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6195672750473022, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8631958365440369, + "num_tokens": 700060612.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.4817432165145874, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8727060556411743, + "num_tokens": 700105401.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7582979202270508, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8769601583480835, + "num_tokens": 700144225.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.786852478981018, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8711621761322021, + "num_tokens": 700177982.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6404616832733154, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8633853197097778, + "num_tokens": 700216871.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6913377046585083, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.868492841720581, + "num_tokens": 700258729.0, + "step": 18355 + }, + { + "epoch": 2.3350718738074034, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6942470073699951, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8589171171188354, + "num_tokens": 700302407.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6997625827789307, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8740343451499939, + "num_tokens": 700337757.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6181405782699585, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8668482303619385, + "num_tokens": 700377381.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.71629798412323, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.868912398815155, + "num_tokens": 700415803.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.65542471408844, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8627883195877075, + "num_tokens": 700456511.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.8026974201202393, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8861409425735474, + "num_tokens": 700488220.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7305959463119507, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8741457462310791, + "num_tokens": 700524257.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5867623090744019, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8726392984390259, + "num_tokens": 700566664.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.4841058254241943, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8753841519355774, + "num_tokens": 700610342.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6252561807632446, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8821332454681396, + "num_tokens": 700644316.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7087442874908447, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8750922679901123, + "num_tokens": 700676142.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.5765243768692017, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8798593282699585, + "num_tokens": 700717015.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6036087274551392, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8700275421142578, + "num_tokens": 700756634.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.678871750831604, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8884108066558838, + "num_tokens": 700793601.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6355761289596558, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8738541603088379, + "num_tokens": 700832719.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6529128551483154, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8850857019424438, + "num_tokens": 700868450.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.6789960861206055, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8715864419937134, + "num_tokens": 700906796.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.502549648284912, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8870025873184204, + "num_tokens": 700948618.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 2.1817245483398438, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.893052339553833, + "num_tokens": 700988986.0, + "step": 18374 + }, + { + "epoch": 2.3374888691006235, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.7578977346420288, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8677000999450684, + "num_tokens": 701024665.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 2.777576446533203e-05, + "grad_norm": 1.639072060585022, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8768637180328369, + "num_tokens": 701063889.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.628220796585083, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8651211857795715, + "num_tokens": 701107915.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.696845293045044, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8689016103744507, + "num_tokens": 701146201.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.765655279159546, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8612408638000488, + "num_tokens": 701187909.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6355836391448975, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8821866512298584, + "num_tokens": 701224490.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6785775423049927, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8574658632278442, + "num_tokens": 701268093.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7125309705734253, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8741669654846191, + "num_tokens": 701305299.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.805531620979309, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8724527359008789, + "num_tokens": 701336886.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6128969192504883, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8847092390060425, + "num_tokens": 701377114.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.608496069908142, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8730477094650269, + "num_tokens": 701415590.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 2.0649561882019043, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8766145706176758, + "num_tokens": 701454323.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5984511375427246, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8691558837890625, + "num_tokens": 701495331.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6222882270812988, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8861215710639954, + "num_tokens": 701531896.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 2.255970001220703, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8768512010574341, + "num_tokens": 701567200.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6750006675720215, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8690221309661865, + "num_tokens": 701602254.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5070521831512451, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.87373948097229, + "num_tokens": 701647037.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.674012303352356, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8883928060531616, + "num_tokens": 701680003.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5562200546264648, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8889099359512329, + "num_tokens": 701722388.0, + "step": 18393 + }, + { + "epoch": 2.339905864393843, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6712651252746582, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8816391229629517, + "num_tokens": 701758184.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.640627145767212, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8806954622268677, + "num_tokens": 701793656.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6345152854919434, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.880085825920105, + "num_tokens": 701833985.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7524343729019165, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8754514455795288, + "num_tokens": 701866499.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.478757619857788, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8783489465713501, + "num_tokens": 701912740.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5441250801086426, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8743250370025635, + "num_tokens": 701957816.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7114787101745605, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.869806170463562, + "num_tokens": 701999032.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7198745012283325, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8762128353118896, + "num_tokens": 702033751.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7925727367401123, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8652838468551636, + "num_tokens": 702072121.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7030647993087769, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8785812258720398, + "num_tokens": 702104170.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7222673892974854, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8734688758850098, + "num_tokens": 702138561.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8548784255981445, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8615223169326782, + "num_tokens": 702175743.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7859827280044556, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8547295331954956, + "num_tokens": 702211013.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7537761926651, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.86561518907547, + "num_tokens": 702248950.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6654256582260132, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8647947907447815, + "num_tokens": 702288992.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.748300313949585, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8770508766174316, + "num_tokens": 702322879.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.589838981628418, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8812332153320312, + "num_tokens": 702363303.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.668177843093872, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8778480291366577, + "num_tokens": 702397300.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6154155731201172, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8789076209068298, + "num_tokens": 702435689.0, + "step": 18412 + }, + { + "epoch": 2.3423228596870627, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6118264198303223, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8726221323013306, + "num_tokens": 702478765.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7833856344223022, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8691873550415039, + "num_tokens": 702512025.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5630323886871338, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8844805359840393, + "num_tokens": 702550568.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5463899374008179, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8850662708282471, + "num_tokens": 702591475.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6805158853530884, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8679577708244324, + "num_tokens": 702629145.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7441074848175049, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8677688241004944, + "num_tokens": 702664285.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5233383178710938, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8729652166366577, + "num_tokens": 702705329.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.663073182106018, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8773527145385742, + "num_tokens": 702741873.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8636140823364258, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8744803667068481, + "num_tokens": 702774638.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.761769413948059, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8812516927719116, + "num_tokens": 702806017.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7150399684906006, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8686926364898682, + "num_tokens": 702844491.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6644890308380127, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8691469430923462, + "num_tokens": 702885376.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6548840999603271, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8792450428009033, + "num_tokens": 702925133.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5685551166534424, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8827167749404907, + "num_tokens": 702963519.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.725724220275879, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8802292346954346, + "num_tokens": 702999307.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6049063205718994, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8587677478790283, + "num_tokens": 703038573.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6303709745407104, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8806307911872864, + "num_tokens": 703076720.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6422978639602661, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8560341000556946, + "num_tokens": 703121050.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7547531127929688, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8510938286781311, + "num_tokens": 703162301.0, + "step": 18431 + }, + { + "epoch": 2.3447398549802823, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5663728713989258, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8819391131401062, + "num_tokens": 703200560.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4187328815460205, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8891980648040771, + "num_tokens": 703241608.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5057774782180786, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8839373588562012, + "num_tokens": 703283380.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7243307828903198, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8805898427963257, + "num_tokens": 703316833.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6751455068588257, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8651549220085144, + "num_tokens": 703355247.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6914409399032593, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8639208078384399, + "num_tokens": 703394332.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6005139350891113, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8800833225250244, + "num_tokens": 703435355.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9063464403152466, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8660317659378052, + "num_tokens": 703465809.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8056721687316895, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8693664073944092, + "num_tokens": 703501258.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.602925419807434, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8708344101905823, + "num_tokens": 703545172.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7187700271606445, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8603361248970032, + "num_tokens": 703584038.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6789500713348389, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.871103048324585, + "num_tokens": 703623773.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7253129482269287, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8657575845718384, + "num_tokens": 703657061.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.616361141204834, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8869444131851196, + "num_tokens": 703695509.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4587275981903076, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8740820288658142, + "num_tokens": 703746332.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6410330533981323, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8803018927574158, + "num_tokens": 703784693.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4727517366409302, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8850984573364258, + "num_tokens": 703829931.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5797951221466064, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8574219942092896, + "num_tokens": 703872698.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6040176153182983, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8784651756286621, + "num_tokens": 703913277.0, + "step": 18450 + }, + { + "epoch": 2.3471568502735023, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6840736865997314, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8786277770996094, + "num_tokens": 703949899.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6773021221160889, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8806236386299133, + "num_tokens": 703986127.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.670743465423584, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8674594163894653, + "num_tokens": 704026385.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6535685062408447, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8778660297393799, + "num_tokens": 704061007.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6468462944030762, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8833577036857605, + "num_tokens": 704100669.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6349273920059204, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8699610233306885, + "num_tokens": 704141197.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9058598279953003, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8754348158836365, + "num_tokens": 704169602.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4193739891052246, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8883630037307739, + "num_tokens": 704217455.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6325968503952026, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8864901065826416, + "num_tokens": 704252775.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7894246578216553, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8607612252235413, + "num_tokens": 704287928.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5490667819976807, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8719276189804077, + "num_tokens": 704330085.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6711639165878296, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8887852430343628, + "num_tokens": 704366573.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6455813646316528, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8714074492454529, + "num_tokens": 704404699.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5647788047790527, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8814607262611389, + "num_tokens": 704443502.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.816656470298767, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8713403940200806, + "num_tokens": 704475840.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7286739349365234, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8611865043640137, + "num_tokens": 704514756.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6763553619384766, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8873447775840759, + "num_tokens": 704549051.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6722187995910645, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8760911226272583, + "num_tokens": 704586527.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5890978574752808, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8782249093055725, + "num_tokens": 704629301.0, + "step": 18469 + }, + { + "epoch": 2.349573845566722, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.645072102546692, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8796213865280151, + "num_tokens": 704666726.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7310981750488281, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8811638355255127, + "num_tokens": 704705491.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6821781396865845, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8664276003837585, + "num_tokens": 704743732.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5798438787460327, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.878142237663269, + "num_tokens": 704781225.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6941276788711548, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.879452109336853, + "num_tokens": 704816960.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5377546548843384, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8824717998504639, + "num_tokens": 704855530.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4903463125228882, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.881027102470398, + "num_tokens": 704899330.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7141426801681519, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8717432618141174, + "num_tokens": 704938550.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8845107555389404, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8553951978683472, + "num_tokens": 704970749.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5482921600341797, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8808771967887878, + "num_tokens": 705012472.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9667150974273682, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8677201271057129, + "num_tokens": 705041755.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.71895432472229, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8861685991287231, + "num_tokens": 705075678.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5821952819824219, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8844091892242432, + "num_tokens": 705116709.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.621002435684204, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8754367828369141, + "num_tokens": 705155159.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.692350149154663, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.872352123260498, + "num_tokens": 705193567.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7625659704208374, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8821889162063599, + "num_tokens": 705228210.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7339279651641846, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.875124990940094, + "num_tokens": 705263904.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.731091856956482, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8750977516174316, + "num_tokens": 705299147.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6576063632965088, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8727536201477051, + "num_tokens": 705336122.0, + "step": 18488 + }, + { + "epoch": 2.3519908408599415, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8133676052093506, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8834153413772583, + "num_tokens": 705368455.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.664431095123291, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8564474582672119, + "num_tokens": 705409217.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7532862424850464, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8835844993591309, + "num_tokens": 705443651.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5279425382614136, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8873069286346436, + "num_tokens": 705481667.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6745448112487793, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8733829259872437, + "num_tokens": 705518984.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7399640083312988, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8505792617797852, + "num_tokens": 705557030.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6531046628952026, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8707660436630249, + "num_tokens": 705593422.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7612498998641968, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8731549978256226, + "num_tokens": 705627653.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.537731647491455, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8781344890594482, + "num_tokens": 705676046.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6218922138214111, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8750733733177185, + "num_tokens": 705715285.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6888654232025146, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8719520568847656, + "num_tokens": 705750320.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7323580980300903, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8771176338195801, + "num_tokens": 705786804.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6474803686141968, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8855185508728027, + "num_tokens": 705823583.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6488635540008545, + "learning_rate": 1e-06, + "loss": 0.3364, + "mean_token_accuracy": 0.89357590675354, + "num_tokens": 705860229.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6081713438034058, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8801572322845459, + "num_tokens": 705897436.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7514981031417847, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8693185448646545, + "num_tokens": 705934564.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8106480836868286, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8707960844039917, + "num_tokens": 705969869.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5948140621185303, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8748535513877869, + "num_tokens": 706009951.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 16.70364761352539, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8786255717277527, + "num_tokens": 706050811.0, + "step": 18507 + }, + { + "epoch": 2.354407836153161, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6296650171279907, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8732448816299438, + "num_tokens": 706088290.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6340806484222412, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.884757399559021, + "num_tokens": 706123270.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7458524703979492, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8812065124511719, + "num_tokens": 706155330.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6675562858581543, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.881044328212738, + "num_tokens": 706189855.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4684221744537354, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8836519122123718, + "num_tokens": 706233943.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7322601079940796, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8735236525535583, + "num_tokens": 706269785.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.657474398612976, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8700281977653503, + "num_tokens": 706308780.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7413277626037598, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8786212205886841, + "num_tokens": 706343641.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6742192506790161, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8817175626754761, + "num_tokens": 706378270.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6177592277526855, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.861568033695221, + "num_tokens": 706419424.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4972591400146484, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8767659664154053, + "num_tokens": 706464294.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7074674367904663, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8779396414756775, + "num_tokens": 706499584.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6681747436523438, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.864187479019165, + "num_tokens": 706539542.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.646235466003418, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8682832717895508, + "num_tokens": 706580115.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8353828191757202, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8704273700714111, + "num_tokens": 706615576.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7028409242630005, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8646548986434937, + "num_tokens": 706652686.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.613772988319397, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.870712399482727, + "num_tokens": 706689967.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5639578104019165, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8862956166267395, + "num_tokens": 706727174.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7035877704620361, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8909293413162231, + "num_tokens": 706757463.0, + "step": 18526 + }, + { + "epoch": 2.3568248314463807, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7992078065872192, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8594461679458618, + "num_tokens": 706793533.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.644849419593811, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8800778985023499, + "num_tokens": 706830112.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.524105429649353, + "learning_rate": 1e-06, + "loss": 0.3211, + "mean_token_accuracy": 0.8947422504425049, + "num_tokens": 706871315.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5681320428848267, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8769243955612183, + "num_tokens": 706912879.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7325435876846313, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8650293350219727, + "num_tokens": 706952199.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6060742139816284, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8814413547515869, + "num_tokens": 706990489.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.700683832168579, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8806994557380676, + "num_tokens": 707027735.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8035516738891602, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8882603049278259, + "num_tokens": 707058363.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5247864723205566, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8755941390991211, + "num_tokens": 707101309.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7028664350509644, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8722434639930725, + "num_tokens": 707141872.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6588788032531738, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8789154887199402, + "num_tokens": 707181598.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.619606614112854, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.86496502161026, + "num_tokens": 707225589.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5788402557373047, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8764235377311707, + "num_tokens": 707270828.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6505521535873413, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8846650719642639, + "num_tokens": 707305743.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5454689264297485, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8738659620285034, + "num_tokens": 707344019.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5813552141189575, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8792448043823242, + "num_tokens": 707381586.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.702526569366455, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8623253107070923, + "num_tokens": 707417421.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5925935506820679, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8606184720993042, + "num_tokens": 707459206.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5720568895339966, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8732442855834961, + "num_tokens": 707500205.0, + "step": 18545 + }, + { + "epoch": 2.3592418267396007, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5917177200317383, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.882718026638031, + "num_tokens": 707537215.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6200910806655884, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.870261013507843, + "num_tokens": 707578224.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6326584815979004, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8824024200439453, + "num_tokens": 707614138.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.682544231414795, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8756210803985596, + "num_tokens": 707656415.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6222957372665405, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.886180579662323, + "num_tokens": 707693106.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.597076416015625, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8854078054428101, + "num_tokens": 707729425.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.855261206626892, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8650820255279541, + "num_tokens": 707764097.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6036933660507202, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8879708051681519, + "num_tokens": 707801757.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6942437887191772, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8837108612060547, + "num_tokens": 707840524.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.608088493347168, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8736954927444458, + "num_tokens": 707876627.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8805067539215088, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8783042430877686, + "num_tokens": 707909119.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6639820337295532, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8758319616317749, + "num_tokens": 707945428.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7803480625152588, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8809477090835571, + "num_tokens": 707980565.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.841566562652588, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8611221313476562, + "num_tokens": 708017737.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6265568733215332, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8765251040458679, + "num_tokens": 708057416.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6665737628936768, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8759746551513672, + "num_tokens": 708094590.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6243367195129395, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8821070194244385, + "num_tokens": 708135915.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5998454093933105, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8797814846038818, + "num_tokens": 708177335.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7010066509246826, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8775238990783691, + "num_tokens": 708210990.0, + "step": 18564 + }, + { + "epoch": 2.3616588220328203, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6394495964050293, + "learning_rate": 1e-06, + "loss": 0.3322, + "mean_token_accuracy": 0.8917810916900635, + "num_tokens": 708247060.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5405157804489136, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8736885786056519, + "num_tokens": 708287487.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6620793342590332, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8802369832992554, + "num_tokens": 708321224.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7612541913986206, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8816393613815308, + "num_tokens": 708351462.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7379640340805054, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8818261623382568, + "num_tokens": 708386078.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8988384008407593, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8717424869537354, + "num_tokens": 708417284.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7679264545440674, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8579638004302979, + "num_tokens": 708452319.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6227853298187256, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.882265567779541, + "num_tokens": 708488662.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7699189186096191, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8735525012016296, + "num_tokens": 708523278.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8794660568237305, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8792198896408081, + "num_tokens": 708554461.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.793875813484192, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8720722794532776, + "num_tokens": 708594586.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6415977478027344, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8790190815925598, + "num_tokens": 708634266.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6957383155822754, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.863845944404602, + "num_tokens": 708676188.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5659090280532837, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8776251077651978, + "num_tokens": 708717409.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.639039397239685, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8792175054550171, + "num_tokens": 708752511.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6506716012954712, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8803398609161377, + "num_tokens": 708790750.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.666843056678772, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8620619177818298, + "num_tokens": 708831068.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6937265396118164, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8663098216056824, + "num_tokens": 708868363.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7348624467849731, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8730962872505188, + "num_tokens": 708902981.0, + "step": 18583 + }, + { + "epoch": 2.36407581732604, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5864083766937256, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8813387155532837, + "num_tokens": 708942817.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6895123720169067, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8801140785217285, + "num_tokens": 708978480.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.818911075592041, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8748137354850769, + "num_tokens": 709012889.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7138700485229492, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8827309608459473, + "num_tokens": 709048560.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.77958083152771, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8670657277107239, + "num_tokens": 709085816.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6606061458587646, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8794528841972351, + "num_tokens": 709123128.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5686898231506348, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8683786392211914, + "num_tokens": 709167757.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.759131908416748, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8783859610557556, + "num_tokens": 709204698.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6432597637176514, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8720439672470093, + "num_tokens": 709240437.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5638108253479004, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8926714062690735, + "num_tokens": 709275928.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5848314762115479, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8832427263259888, + "num_tokens": 709314183.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7015759944915771, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8668417930603027, + "num_tokens": 709354278.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5561773777008057, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8804597854614258, + "num_tokens": 709395522.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5399365425109863, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8812822699546814, + "num_tokens": 709436620.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7054378986358643, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8794567584991455, + "num_tokens": 709470369.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.656520128250122, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8785203099250793, + "num_tokens": 709508457.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7495685815811157, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8761124610900879, + "num_tokens": 709540705.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.734161615371704, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8774068355560303, + "num_tokens": 709579244.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.740235447883606, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8740196228027344, + "num_tokens": 709620215.0, + "step": 18602 + }, + { + "epoch": 2.3664928126192595, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6834043264389038, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8760766983032227, + "num_tokens": 709658723.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5064475536346436, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8828545808792114, + "num_tokens": 709699989.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5666584968566895, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.871981143951416, + "num_tokens": 709745349.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6814204454421997, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.881571888923645, + "num_tokens": 709782333.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8347771167755127, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8802098035812378, + "num_tokens": 709813413.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.69669771194458, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8745712041854858, + "num_tokens": 709851128.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5428341627120972, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8931121826171875, + "num_tokens": 709888608.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.671290397644043, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8792871236801147, + "num_tokens": 709924954.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.567685842514038, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8568716049194336, + "num_tokens": 709973856.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.675943374633789, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8740396499633789, + "num_tokens": 710013485.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.733823299407959, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.864246129989624, + "num_tokens": 710050723.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6957975625991821, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8566938638687134, + "num_tokens": 710088301.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.621651530265808, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8784350752830505, + "num_tokens": 710126442.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6267119646072388, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8682461977005005, + "num_tokens": 710164957.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7272369861602783, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8874471187591553, + "num_tokens": 710198200.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6120442152023315, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.870887041091919, + "num_tokens": 710238206.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.822304129600525, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8760198950767517, + "num_tokens": 710274156.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7970380783081055, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8700956106185913, + "num_tokens": 710308818.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6937154531478882, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.868545413017273, + "num_tokens": 710351172.0, + "step": 18621 + }, + { + "epoch": 2.3689098079124795, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7205111980438232, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8572138547897339, + "num_tokens": 710391290.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.828185796737671, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.8879061341285706, + "num_tokens": 710425293.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6507905721664429, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8689901232719421, + "num_tokens": 710466407.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6735646724700928, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8766958713531494, + "num_tokens": 710502560.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7622681856155396, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.870087742805481, + "num_tokens": 710541653.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7205244302749634, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8759394884109497, + "num_tokens": 710579967.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8253918886184692, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8688787817955017, + "num_tokens": 710613863.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7223949432373047, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8704039454460144, + "num_tokens": 710651686.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.669736385345459, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8753724098205566, + "num_tokens": 710687241.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6989067792892456, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8712046146392822, + "num_tokens": 710726132.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 3.6356961727142334, + "learning_rate": 1e-06, + "loss": 0.3232, + "mean_token_accuracy": 0.8934520483016968, + "num_tokens": 710762884.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7542074918746948, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8891777396202087, + "num_tokens": 710800813.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6402913331985474, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8718904256820679, + "num_tokens": 710840785.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5553501844406128, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8921465873718262, + "num_tokens": 710879025.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5550751686096191, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8824354410171509, + "num_tokens": 710919747.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7433607578277588, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8743063807487488, + "num_tokens": 710958636.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5908187627792358, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8796472549438477, + "num_tokens": 710995387.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 4.677611827850342, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8644559383392334, + "num_tokens": 711039743.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5055961608886719, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8823649883270264, + "num_tokens": 711082914.0, + "step": 18640 + }, + { + "epoch": 2.371326803205699, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5623832941055298, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8805990219116211, + "num_tokens": 711123107.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4847650527954102, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8874443769454956, + "num_tokens": 711162840.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5912048816680908, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8800687789916992, + "num_tokens": 711199313.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.59189772605896, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8632688522338867, + "num_tokens": 711239926.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8254798650741577, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8907908797264099, + "num_tokens": 711267946.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7909374237060547, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8943526744842529, + "num_tokens": 711298402.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5472548007965088, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8773130774497986, + "num_tokens": 711341032.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.478412389755249, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8816732168197632, + "num_tokens": 711387310.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.627543330192566, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8720924854278564, + "num_tokens": 711432710.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.557604432106018, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8785955309867859, + "num_tokens": 711473665.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7282544374465942, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8775627017021179, + "num_tokens": 711508682.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6620256900787354, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8746461868286133, + "num_tokens": 711548036.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6364214420318604, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.889466404914856, + "num_tokens": 711586802.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.661839485168457, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8662400245666504, + "num_tokens": 711625204.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7912241220474243, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8777176737785339, + "num_tokens": 711656413.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8491164445877075, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8712617754936218, + "num_tokens": 711687567.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8157389163970947, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8687139749526978, + "num_tokens": 711722679.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6926811933517456, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8657439351081848, + "num_tokens": 711766658.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.726788878440857, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.878437876701355, + "num_tokens": 711799371.0, + "step": 18659 + }, + { + "epoch": 2.3737437984989187, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7090976238250732, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8679538369178772, + "num_tokens": 711838752.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8902744054794312, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8808406591415405, + "num_tokens": 711873308.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6410810947418213, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8799515962600708, + "num_tokens": 711913093.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6223996877670288, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.871734619140625, + "num_tokens": 711955401.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.724194884300232, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8870807886123657, + "num_tokens": 711988255.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6012448072433472, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.884959876537323, + "num_tokens": 712032166.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.760933518409729, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.889290988445282, + "num_tokens": 712068167.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.706929087638855, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8737369775772095, + "num_tokens": 712105819.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7387866973876953, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8618423938751221, + "num_tokens": 712143096.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.635733723640442, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8702906370162964, + "num_tokens": 712184299.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.761953592300415, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.864680290222168, + "num_tokens": 712223294.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5650964975357056, + "learning_rate": 1e-06, + "loss": 0.33, + "mean_token_accuracy": 0.8929091095924377, + "num_tokens": 712260558.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7186918258666992, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8652114868164062, + "num_tokens": 712301144.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5697643756866455, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8776057362556458, + "num_tokens": 712343350.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.65158212184906, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8841131925582886, + "num_tokens": 712380126.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7328897714614868, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8692281246185303, + "num_tokens": 712413894.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6432863473892212, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8725235462188721, + "num_tokens": 712454240.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5484802722930908, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8658339977264404, + "num_tokens": 712498093.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.819298267364502, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8686397075653076, + "num_tokens": 712531296.0, + "step": 18678 + }, + { + "epoch": 2.3761607937921383, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.550073266029358, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8809897303581238, + "num_tokens": 712571335.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7607468366622925, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8549723625183105, + "num_tokens": 712609606.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7671253681182861, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8837774991989136, + "num_tokens": 712642187.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.542263388633728, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8700734376907349, + "num_tokens": 712685798.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5543036460876465, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8838258385658264, + "num_tokens": 712726617.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5615211725234985, + "learning_rate": 1e-06, + "loss": 0.3264, + "mean_token_accuracy": 0.8934228420257568, + "num_tokens": 712760955.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.559872031211853, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8763477802276611, + "num_tokens": 712798980.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6885391473770142, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8732790946960449, + "num_tokens": 712835341.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5576692819595337, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8741126656532288, + "num_tokens": 712873571.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.545548439025879, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8798885941505432, + "num_tokens": 712913644.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.616505742073059, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8637169599533081, + "num_tokens": 712954962.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5544713735580444, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8846136331558228, + "num_tokens": 712997398.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.740918517112732, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8634811639785767, + "num_tokens": 713032334.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5155726671218872, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8930174708366394, + "num_tokens": 713074078.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5213861465454102, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8836389183998108, + "num_tokens": 713116478.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8159271478652954, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8644293546676636, + "num_tokens": 713150559.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7306441068649292, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8602030277252197, + "num_tokens": 713187098.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6923856735229492, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.872941792011261, + "num_tokens": 713223917.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6810860633850098, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.878993034362793, + "num_tokens": 713263559.0, + "step": 18697 + }, + { + "epoch": 2.378577789085358, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5810110569000244, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8827605843544006, + "num_tokens": 713302975.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6407822370529175, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.88376784324646, + "num_tokens": 713339003.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6548206806182861, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8750353455543518, + "num_tokens": 713375558.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6199073791503906, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8607023358345032, + "num_tokens": 713415776.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6757816076278687, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8723315000534058, + "num_tokens": 713455898.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5781209468841553, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8828042149543762, + "num_tokens": 713494634.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6195710897445679, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8743811249732971, + "num_tokens": 713532936.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7448428869247437, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8731057643890381, + "num_tokens": 713570942.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6749552488327026, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8522465229034424, + "num_tokens": 713611707.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5825965404510498, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.883995771408081, + "num_tokens": 713650742.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6221041679382324, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8879128694534302, + "num_tokens": 713686119.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8284001350402832, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8624846935272217, + "num_tokens": 713721705.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5277321338653564, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8679274320602417, + "num_tokens": 713766940.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5602378845214844, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.867798388004303, + "num_tokens": 713805967.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8861912488937378, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8729569315910339, + "num_tokens": 713836534.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7380478382110596, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8726305961608887, + "num_tokens": 713872537.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6846230030059814, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8748618364334106, + "num_tokens": 713911976.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5956617593765259, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8630286455154419, + "num_tokens": 713953699.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5984165668487549, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8762127161026001, + "num_tokens": 713994024.0, + "step": 18716 + }, + { + "epoch": 2.380994784378578, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.687408208847046, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.88634192943573, + "num_tokens": 714028013.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7638047933578491, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.880942702293396, + "num_tokens": 714066881.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6613680124282837, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8705641031265259, + "num_tokens": 714106455.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.640751838684082, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8817446231842041, + "num_tokens": 714143666.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6459076404571533, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8838019371032715, + "num_tokens": 714179058.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7101789712905884, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8790476322174072, + "num_tokens": 714217641.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9138144254684448, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8642261028289795, + "num_tokens": 714253151.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7314802408218384, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8776530027389526, + "num_tokens": 714290960.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5757590532302856, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8694612979888916, + "num_tokens": 714328967.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7434637546539307, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8772270679473877, + "num_tokens": 714364477.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7571645975112915, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8621817827224731, + "num_tokens": 714409971.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.661800503730774, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8806240558624268, + "num_tokens": 714450304.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5619630813598633, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8867495656013489, + "num_tokens": 714491875.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6113996505737305, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8822489380836487, + "num_tokens": 714533929.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7272570133209229, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8781020045280457, + "num_tokens": 714566687.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6077196598052979, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8712912797927856, + "num_tokens": 714607413.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8482162952423096, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8625942468643188, + "num_tokens": 714644917.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6202774047851562, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8779774904251099, + "num_tokens": 714681600.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5732697248458862, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8803174495697021, + "num_tokens": 714719995.0, + "step": 18735 + }, + { + "epoch": 2.3834117796717975, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4977707862854004, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8801144361495972, + "num_tokens": 714763544.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8077223300933838, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.861423909664154, + "num_tokens": 714799221.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6642669439315796, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8811757564544678, + "num_tokens": 714834582.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5217645168304443, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8675044775009155, + "num_tokens": 714876818.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6872745752334595, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8610574007034302, + "num_tokens": 714916265.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7778514623641968, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8671025037765503, + "num_tokens": 714950307.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6862704753875732, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8891108632087708, + "num_tokens": 714984166.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6066161394119263, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8670780658721924, + "num_tokens": 715025560.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8256165981292725, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8516735434532166, + "num_tokens": 715059291.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7537215948104858, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.86562579870224, + "num_tokens": 715097840.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7804064750671387, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8747619986534119, + "num_tokens": 715134133.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.608119010925293, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8821763396263123, + "num_tokens": 715170115.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6650638580322266, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8703402280807495, + "num_tokens": 715205844.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6625618934631348, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8664844632148743, + "num_tokens": 715244736.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7924271821975708, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8762181997299194, + "num_tokens": 715282868.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.659501552581787, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8603363633155823, + "num_tokens": 715328894.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.763838529586792, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8810025453567505, + "num_tokens": 715365820.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5391887426376343, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8741496801376343, + "num_tokens": 715409295.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.670318841934204, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8753024339675903, + "num_tokens": 715448016.0, + "step": 18754 + }, + { + "epoch": 2.385828774965017, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6748477220535278, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8785062432289124, + "num_tokens": 715483734.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6105360984802246, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8728383183479309, + "num_tokens": 715523274.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8107534646987915, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8856955766677856, + "num_tokens": 715558467.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6837215423583984, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8824605941772461, + "num_tokens": 715594260.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7859175205230713, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8786545395851135, + "num_tokens": 715632008.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5094815492630005, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8828385472297668, + "num_tokens": 715674416.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8591517210006714, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8724827766418457, + "num_tokens": 715706953.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.770898699760437, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.881100058555603, + "num_tokens": 715738652.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4476391077041626, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8717391490936279, + "num_tokens": 715790951.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.602178692817688, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8856056928634644, + "num_tokens": 715826584.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6092292070388794, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8759942054748535, + "num_tokens": 715866188.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6886883974075317, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8632009625434875, + "num_tokens": 715907765.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6124157905578613, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8760032653808594, + "num_tokens": 715945641.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5659806728363037, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8737938404083252, + "num_tokens": 715987028.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.789320945739746, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8661437630653381, + "num_tokens": 716022094.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6638661623001099, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8532564043998718, + "num_tokens": 716061732.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.570505976676941, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8721532225608826, + "num_tokens": 716102180.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6351958513259888, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8763586282730103, + "num_tokens": 716142225.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5045740604400635, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8933708667755127, + "num_tokens": 716177770.0, + "step": 18773 + }, + { + "epoch": 2.3882457702582367, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.613525629043579, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8605154752731323, + "num_tokens": 716220663.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5982873439788818, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8823147416114807, + "num_tokens": 716257498.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6696646213531494, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8680384755134583, + "num_tokens": 716298944.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8679560422897339, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8698810935020447, + "num_tokens": 716333318.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.832997441291809, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8543314337730408, + "num_tokens": 716371988.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7607226371765137, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8812416791915894, + "num_tokens": 716406833.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6445132493972778, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8772958517074585, + "num_tokens": 716442424.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.701996088027954, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8805361986160278, + "num_tokens": 716474696.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6060824394226074, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8851890563964844, + "num_tokens": 716516561.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.680860996246338, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8725391626358032, + "num_tokens": 716551875.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.660669207572937, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8680769205093384, + "num_tokens": 716589585.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6057472229003906, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8636109232902527, + "num_tokens": 716632415.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6619329452514648, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8844600915908813, + "num_tokens": 716670673.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6663439273834229, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8779264688491821, + "num_tokens": 716708797.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5122859477996826, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8770493268966675, + "num_tokens": 716751777.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6190712451934814, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8782923221588135, + "num_tokens": 716788092.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6870536804199219, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8781626224517822, + "num_tokens": 716823446.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8036941289901733, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.872490406036377, + "num_tokens": 716858920.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5681458711624146, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8598414659500122, + "num_tokens": 716902257.0, + "step": 18792 + }, + { + "epoch": 2.3906627655514567, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.530629277229309, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8777600526809692, + "num_tokens": 716943327.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5580329895019531, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8735841512680054, + "num_tokens": 716982964.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.538389801979065, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8647571206092834, + "num_tokens": 717026735.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5904383659362793, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8788719773292542, + "num_tokens": 717064615.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.700468897819519, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8724116086959839, + "num_tokens": 717102144.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5755873918533325, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8713370561599731, + "num_tokens": 717140436.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.647538423538208, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8695453405380249, + "num_tokens": 717181684.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6734572649002075, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8646950721740723, + "num_tokens": 717220125.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5700006484985352, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8720502853393555, + "num_tokens": 717262951.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7349200248718262, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8650259971618652, + "num_tokens": 717295814.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.600712537765503, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.872920572757721, + "num_tokens": 717329492.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5709550380706787, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8795109987258911, + "num_tokens": 717368070.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.660696268081665, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8687894940376282, + "num_tokens": 717408431.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5781000852584839, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8632768392562866, + "num_tokens": 717453380.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6515907049179077, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8651622533798218, + "num_tokens": 717491628.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5800414085388184, + "learning_rate": 1e-06, + "loss": 0.3418, + "mean_token_accuracy": 0.8898541927337646, + "num_tokens": 717530132.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7715988159179688, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8631364107131958, + "num_tokens": 717564114.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.568933129310608, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.8903053998947144, + "num_tokens": 717605478.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.666231393814087, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8749449253082275, + "num_tokens": 717643889.0, + "step": 18811 + }, + { + "epoch": 2.3930797608446763, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9096797704696655, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8766127824783325, + "num_tokens": 717675824.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6682153940200806, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8823745846748352, + "num_tokens": 717714081.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.655514121055603, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8747177124023438, + "num_tokens": 717756482.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.801034927368164, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.861336350440979, + "num_tokens": 717795042.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6558458805084229, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8699750304222107, + "num_tokens": 717836088.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5685197114944458, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8952599763870239, + "num_tokens": 717877762.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6133404970169067, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8790538311004639, + "num_tokens": 717915628.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 2.000513792037964, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8678772449493408, + "num_tokens": 717946940.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6253211498260498, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.877917468547821, + "num_tokens": 717990230.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7068250179290771, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8697542548179626, + "num_tokens": 718028704.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8333083391189575, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8801336288452148, + "num_tokens": 718061332.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6437292098999023, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8815218210220337, + "num_tokens": 718098980.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6761665344238281, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8802222013473511, + "num_tokens": 718135999.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5850868225097656, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8874691724777222, + "num_tokens": 718173756.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7213127613067627, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8593904972076416, + "num_tokens": 718212615.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6808658838272095, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.875054121017456, + "num_tokens": 718254136.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6885004043579102, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8840774297714233, + "num_tokens": 718291925.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.790765643119812, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8645222187042236, + "num_tokens": 718330080.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5717707872390747, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8694831728935242, + "num_tokens": 718372658.0, + "step": 18830 + }, + { + "epoch": 2.395496756137896, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5405088663101196, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8828859329223633, + "num_tokens": 718417213.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7746031284332275, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8741886615753174, + "num_tokens": 718451626.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.668806552886963, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8592827320098877, + "num_tokens": 718492392.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5350866317749023, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8699707388877869, + "num_tokens": 718534727.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7323349714279175, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8729922771453857, + "num_tokens": 718571899.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5359605550765991, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8742974400520325, + "num_tokens": 718613685.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5226322412490845, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8893476128578186, + "num_tokens": 718654095.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6257879734039307, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8730242252349854, + "num_tokens": 718692441.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4843785762786865, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8865720629692078, + "num_tokens": 718734986.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.689605712890625, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8595319986343384, + "num_tokens": 718771170.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.521842360496521, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8834161758422852, + "num_tokens": 718811854.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5753254890441895, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8775988817214966, + "num_tokens": 718848934.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7407679557800293, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8709627389907837, + "num_tokens": 718884913.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6750967502593994, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8585128784179688, + "num_tokens": 718923293.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5066417455673218, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8832771182060242, + "num_tokens": 718964105.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.612016201019287, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.874018132686615, + "num_tokens": 719006800.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7907202243804932, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8826155066490173, + "num_tokens": 719038507.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.812802791595459, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8713592290878296, + "num_tokens": 719072259.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5704224109649658, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8765624761581421, + "num_tokens": 719116972.0, + "step": 18849 + }, + { + "epoch": 2.3979137514311155, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6263982057571411, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8868067264556885, + "num_tokens": 719155683.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6335481405258179, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8588437438011169, + "num_tokens": 719197344.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.533286452293396, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8870593309402466, + "num_tokens": 719238624.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8187992572784424, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8661525249481201, + "num_tokens": 719269644.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5445648431777954, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8777791261672974, + "num_tokens": 719311630.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6499232053756714, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8813186883926392, + "num_tokens": 719350019.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7158212661743164, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8492175340652466, + "num_tokens": 719391155.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6638147830963135, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8752140998840332, + "num_tokens": 719426808.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7276653051376343, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8733842372894287, + "num_tokens": 719463641.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9345426559448242, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8791499137878418, + "num_tokens": 719492970.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6039247512817383, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8667439222335815, + "num_tokens": 719538517.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6902101039886475, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8698933720588684, + "num_tokens": 719576786.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5757782459259033, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8539371490478516, + "num_tokens": 719623513.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7426798343658447, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8630937933921814, + "num_tokens": 719657775.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6384466886520386, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8776944279670715, + "num_tokens": 719697633.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6374146938323975, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.885482907295227, + "num_tokens": 719736821.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6051832437515259, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8722480535507202, + "num_tokens": 719775117.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.648862600326538, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8741223216056824, + "num_tokens": 719810298.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5723903179168701, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8744958639144897, + "num_tokens": 719851868.0, + "step": 18868 + }, + { + "epoch": 2.400330746724335, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6171332597732544, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8870761394500732, + "num_tokens": 719888238.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5672705173492432, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.876846432685852, + "num_tokens": 719928724.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6270382404327393, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8733436465263367, + "num_tokens": 719965890.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5004072189331055, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8753141164779663, + "num_tokens": 720008086.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4721430540084839, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8863878846168518, + "num_tokens": 720051774.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.667829990386963, + "learning_rate": 1e-06, + "loss": 0.3324, + "mean_token_accuracy": 0.8908206224441528, + "num_tokens": 720085139.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.721376895904541, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8795771598815918, + "num_tokens": 720120045.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.783949375152588, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8530054092407227, + "num_tokens": 720153507.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7025516033172607, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8786427974700928, + "num_tokens": 720189062.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.491716980934143, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8800222873687744, + "num_tokens": 720233356.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8045501708984375, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8712541460990906, + "num_tokens": 720269317.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5675140619277954, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8751776814460754, + "num_tokens": 720310976.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.547216773033142, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8736844062805176, + "num_tokens": 720351818.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6866281032562256, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.880782961845398, + "num_tokens": 720386091.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7038025856018066, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8602669835090637, + "num_tokens": 720424376.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.608773112297058, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8776742219924927, + "num_tokens": 720462834.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.756353735923767, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8774287700653076, + "num_tokens": 720496272.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7255009412765503, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8772967457771301, + "num_tokens": 720535875.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8765228986740112, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8692867755889893, + "num_tokens": 720570231.0, + "step": 18887 + }, + { + "epoch": 2.402747742017555, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.509624719619751, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.878818154335022, + "num_tokens": 720615113.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5729302167892456, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8693932890892029, + "num_tokens": 720655777.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.664298415184021, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8725073933601379, + "num_tokens": 720695188.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5705337524414062, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8810855150222778, + "num_tokens": 720738273.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.665229320526123, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8822602033615112, + "num_tokens": 720774625.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5875794887542725, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8868492841720581, + "num_tokens": 720812770.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8002973794937134, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8691257238388062, + "num_tokens": 720847964.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.743901252746582, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8759558200836182, + "num_tokens": 720884234.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.778607964515686, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.858212947845459, + "num_tokens": 720920410.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6304821968078613, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8845458030700684, + "num_tokens": 720957834.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8116682767868042, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8896182179450989, + "num_tokens": 720994115.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6390187740325928, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8691275715827942, + "num_tokens": 721031711.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8078895807266235, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8683028817176819, + "num_tokens": 721066142.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.436783790588379, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8880089521408081, + "num_tokens": 721109297.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.630403757095337, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8743000626564026, + "num_tokens": 721150497.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6714845895767212, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8809882402420044, + "num_tokens": 721188041.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7064684629440308, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8719627857208252, + "num_tokens": 721227260.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7895029783248901, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8635333776473999, + "num_tokens": 721261264.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.656890869140625, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8728419542312622, + "num_tokens": 721304642.0, + "step": 18906 + }, + { + "epoch": 2.4051647373107747, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.812446117401123, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8609240651130676, + "num_tokens": 721341236.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7230664491653442, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8774625062942505, + "num_tokens": 721373615.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5795918703079224, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8849965333938599, + "num_tokens": 721413037.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5910860300064087, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8931491374969482, + "num_tokens": 721447211.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5604023933410645, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.877447247505188, + "num_tokens": 721489315.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6571513414382935, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8767911195755005, + "num_tokens": 721528251.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.689875602722168, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.890771210193634, + "num_tokens": 721562183.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4461636543273926, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8796824812889099, + "num_tokens": 721606944.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6381831169128418, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8712496757507324, + "num_tokens": 721644631.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5435749292373657, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8859157562255859, + "num_tokens": 721687687.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6610268354415894, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.882784366607666, + "num_tokens": 721725681.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.710479736328125, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.881803035736084, + "num_tokens": 721758181.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7178078889846802, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8692550659179688, + "num_tokens": 721793002.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6330463886260986, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8787124752998352, + "num_tokens": 721829832.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6275672912597656, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8611562252044678, + "num_tokens": 721872731.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.699142575263977, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8664515018463135, + "num_tokens": 721907288.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6946957111358643, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8801244497299194, + "num_tokens": 721944362.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7141532897949219, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8800897598266602, + "num_tokens": 721976895.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.980563998222351, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8795640468597412, + "num_tokens": 722002400.0, + "step": 18925 + }, + { + "epoch": 2.4075817326039943, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7473949193954468, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8700330853462219, + "num_tokens": 722036784.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5937589406967163, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8832665085792542, + "num_tokens": 722078032.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7084832191467285, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8780779838562012, + "num_tokens": 722113829.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5723810195922852, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8614519834518433, + "num_tokens": 722158062.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6833407878875732, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8645009994506836, + "num_tokens": 722195081.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6045095920562744, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8707541823387146, + "num_tokens": 722231714.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6776677370071411, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8829891681671143, + "num_tokens": 722271211.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7766671180725098, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8720378875732422, + "num_tokens": 722307543.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5396435260772705, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8742449879646301, + "num_tokens": 722350718.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.435543179512024, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8768381476402283, + "num_tokens": 722400051.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.50918710231781, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8999999761581421, + "num_tokens": 722434664.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5664772987365723, + "learning_rate": 1e-06, + "loss": 0.333, + "mean_token_accuracy": 0.8906896114349365, + "num_tokens": 722480485.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8489041328430176, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.870886504650116, + "num_tokens": 722513814.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.626275658607483, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8784778118133545, + "num_tokens": 722552213.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7434618473052979, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8855726718902588, + "num_tokens": 722585515.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5413618087768555, + "learning_rate": 1e-06, + "loss": 0.3285, + "mean_token_accuracy": 0.893254280090332, + "num_tokens": 722626124.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4209866523742676, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8841320276260376, + "num_tokens": 722671393.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8020833730697632, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8819171190261841, + "num_tokens": 722702005.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8180580139160156, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8646181225776672, + "num_tokens": 722742330.0, + "step": 18944 + }, + { + "epoch": 2.409998727897214, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.692604660987854, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8905606865882874, + "num_tokens": 722776183.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5266435146331787, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8779121041297913, + "num_tokens": 722819749.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.585762858390808, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8626903891563416, + "num_tokens": 722865164.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4528286457061768, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8776809573173523, + "num_tokens": 722910496.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.692307949066162, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8730456233024597, + "num_tokens": 722948021.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6791611909866333, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8788171410560608, + "num_tokens": 722982689.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6917136907577515, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8620026707649231, + "num_tokens": 723020906.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.887500286102295, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8766365647315979, + "num_tokens": 723048681.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.745495080947876, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8529886603355408, + "num_tokens": 723086956.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7140872478485107, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8719276189804077, + "num_tokens": 723121519.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5954055786132812, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8691946268081665, + "num_tokens": 723163698.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8066446781158447, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8561831712722778, + "num_tokens": 723197692.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.70095956325531, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.882260799407959, + "num_tokens": 723230174.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6044504642486572, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8732506036758423, + "num_tokens": 723269644.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5842901468276978, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8800224661827087, + "num_tokens": 723310687.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6606996059417725, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8717689514160156, + "num_tokens": 723351758.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.653802514076233, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8748623132705688, + "num_tokens": 723390723.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6928476095199585, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8599926829338074, + "num_tokens": 723430479.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7846788167953491, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8792744874954224, + "num_tokens": 723465510.0, + "step": 18963 + }, + { + "epoch": 2.412415723190434, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6869443655014038, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8600569367408752, + "num_tokens": 723504819.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6890308856964111, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8632394671440125, + "num_tokens": 723543326.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5893139839172363, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8720223307609558, + "num_tokens": 723581400.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.618369460105896, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8677864074707031, + "num_tokens": 723618938.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.594061017036438, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8888177871704102, + "num_tokens": 723654698.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6891838312149048, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8771064877510071, + "num_tokens": 723691809.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6668775081634521, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8766607046127319, + "num_tokens": 723727454.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5878510475158691, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8735779523849487, + "num_tokens": 723766837.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5681849718093872, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8797438740730286, + "num_tokens": 723804298.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4909415245056152, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8775798082351685, + "num_tokens": 723848859.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.752376914024353, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.885783314704895, + "num_tokens": 723878953.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.616186499595642, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8686625957489014, + "num_tokens": 723918865.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.3767799139022827, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8881339430809021, + "num_tokens": 723965067.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6343228816986084, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8793454766273499, + "num_tokens": 724003719.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.837496280670166, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8808731436729431, + "num_tokens": 724036026.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6188721656799316, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.8859632015228271, + "num_tokens": 724071632.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5683070421218872, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8663233518600464, + "num_tokens": 724115535.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6398009061813354, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8856644034385681, + "num_tokens": 724154515.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7783502340316772, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8794152736663818, + "num_tokens": 724192612.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8154270648956299, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.874120831489563, + "num_tokens": 724227469.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6009811162948608, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8751205205917358, + "num_tokens": 724265662.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5654149055480957, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8712471723556519, + "num_tokens": 724308461.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7140753269195557, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8584588766098022, + "num_tokens": 724346740.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8777168989181519, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8753100037574768, + "num_tokens": 724377967.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6301592588424683, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8741698861122131, + "num_tokens": 724416703.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.692544937133789, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8789939880371094, + "num_tokens": 724453570.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7169636487960815, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8915824294090271, + "num_tokens": 724488428.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5046008825302124, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8723279237747192, + "num_tokens": 724533195.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5434422492980957, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8861492872238159, + "num_tokens": 724573994.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.532995581626892, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8738341331481934, + "num_tokens": 724616037.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7378114461898804, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8819462060928345, + "num_tokens": 724651481.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6447972059249878, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8684016466140747, + "num_tokens": 724693819.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5871089696884155, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8811037540435791, + "num_tokens": 724734552.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6249579191207886, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8640596866607666, + "num_tokens": 724776635.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5953227281570435, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8803598284721375, + "num_tokens": 724813607.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4625849723815918, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8864909410476685, + "num_tokens": 724855836.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5289549827575684, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8825394511222839, + "num_tokens": 724895705.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6665607690811157, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8759773373603821, + "num_tokens": 724932879.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.736625075340271, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8681777119636536, + "num_tokens": 724967124.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6461503505706787, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8669672608375549, + "num_tokens": 725009074.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6477577686309814, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8705893754959106, + "num_tokens": 725047452.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5733946561813354, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8740381002426147, + "num_tokens": 725087954.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6913235187530518, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8831205368041992, + "num_tokens": 725121826.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7691673040390015, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8675664663314819, + "num_tokens": 725158748.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6082133054733276, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8833779096603394, + "num_tokens": 725195691.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6511991024017334, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8767182230949402, + "num_tokens": 725234535.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6378852128982544, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8584336042404175, + "num_tokens": 725275843.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6403433084487915, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8715670108795166, + "num_tokens": 725315443.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6523172855377197, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8924775719642639, + "num_tokens": 725349465.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7498347759246826, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.862395167350769, + "num_tokens": 725389349.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.649876594543457, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8619749546051025, + "num_tokens": 725428531.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5011719465255737, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8806136846542358, + "num_tokens": 725473924.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6207071542739868, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8807259798049927, + "num_tokens": 725512629.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8608596324920654, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8626151084899902, + "num_tokens": 725545905.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5963937044143677, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.866004467010498, + "num_tokens": 725589232.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7299253940582275, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8878699541091919, + "num_tokens": 725620738.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7054600715637207, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8768619894981384, + "num_tokens": 725658114.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6629083156585693, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.879851222038269, + "num_tokens": 725696486.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6718220710754395, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8588443398475647, + "num_tokens": 725734042.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7903701066970825, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8718223571777344, + "num_tokens": 725771632.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7372795343399048, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8829351663589478, + "num_tokens": 725803464.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.663354516029358, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8748481273651123, + "num_tokens": 725839160.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5962666273117065, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8797569274902344, + "num_tokens": 725883222.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6339973211288452, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8725801706314087, + "num_tokens": 725922785.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.65337073802948, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8694556951522827, + "num_tokens": 725963944.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7312229871749878, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8784599900245667, + "num_tokens": 726001537.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8065444231033325, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8650263547897339, + "num_tokens": 726039806.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6053534746170044, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8646149635314941, + "num_tokens": 726081797.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5204918384552002, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8868920207023621, + "num_tokens": 726124143.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5528333187103271, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8782129287719727, + "num_tokens": 726165661.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.9034466743469238, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8790915012359619, + "num_tokens": 726198002.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.666020154953003, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8737014532089233, + "num_tokens": 726239949.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6609135866165161, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8706996440887451, + "num_tokens": 726278804.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6693817377090454, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8653796911239624, + "num_tokens": 726315753.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7602044343948364, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8640701770782471, + "num_tokens": 726355776.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7247422933578491, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8537076711654663, + "num_tokens": 726395788.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6480860710144043, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.871039092540741, + "num_tokens": 726433120.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6887962818145752, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8702744245529175, + "num_tokens": 726468308.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7619472742080688, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8739163875579834, + "num_tokens": 726498825.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5568218231201172, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8797941207885742, + "num_tokens": 726540926.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5854893922805786, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8873462677001953, + "num_tokens": 726579709.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7902323007583618, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8771541714668274, + "num_tokens": 726614343.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7765001058578491, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8584458827972412, + "num_tokens": 726649987.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.62284255027771, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8819811940193176, + "num_tokens": 726689976.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.549375057220459, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8669098615646362, + "num_tokens": 726733853.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.72967529296875, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.87047278881073, + "num_tokens": 726777600.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6612191200256348, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8832960724830627, + "num_tokens": 726814484.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.669199824333191, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8639962673187256, + "num_tokens": 726855942.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6339439153671265, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8780809044837952, + "num_tokens": 726892273.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.70845365524292, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8740770816802979, + "num_tokens": 726931323.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7655736207962036, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8636857271194458, + "num_tokens": 726965909.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6600091457366943, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8814680576324463, + "num_tokens": 727002986.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6276973485946655, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8831927180290222, + "num_tokens": 727042241.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7640703916549683, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8571289777755737, + "num_tokens": 727079045.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.790043592453003, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8579491376876831, + "num_tokens": 727114884.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6018073558807373, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8807054758071899, + "num_tokens": 727154822.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6637415885925293, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8802192807197571, + "num_tokens": 727191764.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5658591985702515, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8875299692153931, + "num_tokens": 727234875.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5693278312683105, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8808742761611938, + "num_tokens": 727277165.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.489121675491333, + "learning_rate": 1e-06, + "loss": 0.3326, + "mean_token_accuracy": 0.8920866250991821, + "num_tokens": 727320844.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6003267765045166, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8814542293548584, + "num_tokens": 727359194.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4622453451156616, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8666748404502869, + "num_tokens": 727406747.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6203464269638062, + "learning_rate": 1e-06, + "loss": 0.3186, + "mean_token_accuracy": 0.8975610136985779, + "num_tokens": 727438195.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.649865984916687, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.878176212310791, + "num_tokens": 727477491.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5248775482177734, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8901183605194092, + "num_tokens": 727514837.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6797806024551392, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8671222925186157, + "num_tokens": 727554903.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7251875400543213, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8776814937591553, + "num_tokens": 727593421.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.864522933959961, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8651895523071289, + "num_tokens": 727628861.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7679388523101807, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.883086085319519, + "num_tokens": 727662993.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7380441427230835, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8735096454620361, + "num_tokens": 727698908.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.710371732711792, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8764119744300842, + "num_tokens": 727739614.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5260111093521118, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8735756874084473, + "num_tokens": 727782245.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7376844882965088, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8725440502166748, + "num_tokens": 727817652.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7967162132263184, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8790041208267212, + "num_tokens": 727848396.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.579864501953125, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8763365745544434, + "num_tokens": 727887776.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.700776219367981, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8762764930725098, + "num_tokens": 727922775.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5820647478103638, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8638705015182495, + "num_tokens": 727971871.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7290312051773071, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8656909465789795, + "num_tokens": 728011332.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6168389320373535, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8821003437042236, + "num_tokens": 728048864.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7482787370681763, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8557372689247131, + "num_tokens": 728088044.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7053979635238647, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8639122843742371, + "num_tokens": 728124596.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4760079383850098, + "learning_rate": 1e-06, + "loss": 0.3283, + "mean_token_accuracy": 0.8940373659133911, + "num_tokens": 728170324.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8022286891937256, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8586975932121277, + "num_tokens": 728204090.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5460909605026245, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8686539530754089, + "num_tokens": 728244274.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7493966817855835, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8779413104057312, + "num_tokens": 728276759.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.722778558731079, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8557550311088562, + "num_tokens": 728313855.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7343437671661377, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8823551535606384, + "num_tokens": 728347724.0, + "step": 19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4954078197479248, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8804319500923157, + "num_tokens": 728390153.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6810684204101562, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8639904260635376, + "num_tokens": 728431218.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6093214750289917, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8795450925827026, + "num_tokens": 728469041.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7017706632614136, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8828805685043335, + "num_tokens": 728503246.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7004250288009644, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8785468339920044, + "num_tokens": 728535918.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5659734010696411, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8860366344451904, + "num_tokens": 728573612.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7060288190841675, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8848775625228882, + "num_tokens": 728611249.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6877013444900513, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8657965064048767, + "num_tokens": 728650743.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6947072744369507, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.873842716217041, + "num_tokens": 728687238.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8076820373535156, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8631658554077148, + "num_tokens": 728725915.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6402983665466309, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8802855014801025, + "num_tokens": 728766980.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7052061557769775, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8745798468589783, + "num_tokens": 728804559.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5612927675247192, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8784462809562683, + "num_tokens": 728847256.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6591700315475464, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8787109851837158, + "num_tokens": 728885657.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5570989847183228, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.873104453086853, + "num_tokens": 728928820.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6517701148986816, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8799091577529907, + "num_tokens": 728964415.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5118558406829834, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8791258335113525, + "num_tokens": 729005046.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.656582236289978, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.870857834815979, + "num_tokens": 729041007.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8132286071777344, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8669525384902954, + "num_tokens": 729075897.0, + "step": 19109 + }, + { + "epoch": 2.4309884238646484, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5590875148773193, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8748109340667725, + "num_tokens": 729120270.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8189430236816406, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8625228404998779, + "num_tokens": 729155017.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5819450616836548, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8870798349380493, + "num_tokens": 729190423.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5937883853912354, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8842729926109314, + "num_tokens": 729228743.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8369766473770142, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8598663210868835, + "num_tokens": 729271742.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5974068641662598, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8594375252723694, + "num_tokens": 729314878.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.795170545578003, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8549492955207825, + "num_tokens": 729348090.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5566048622131348, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8714709877967834, + "num_tokens": 729388343.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6351889371871948, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8784756660461426, + "num_tokens": 729425782.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5878063440322876, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8825258612632751, + "num_tokens": 729465840.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5364872217178345, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8802266120910645, + "num_tokens": 729510598.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6548354625701904, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8618907332420349, + "num_tokens": 729548296.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5774407386779785, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8683937788009644, + "num_tokens": 729587749.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6981056928634644, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8701643347740173, + "num_tokens": 729625672.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5243117809295654, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8781661987304688, + "num_tokens": 729663904.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6406434774398804, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8780667185783386, + "num_tokens": 729702415.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.62442147731781, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8772168755531311, + "num_tokens": 729740514.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7043780088424683, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8714563846588135, + "num_tokens": 729775835.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7725205421447754, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8681398630142212, + "num_tokens": 729811275.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8295021057128906, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8873444199562073, + "num_tokens": 729841947.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6807557344436646, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8627016544342041, + "num_tokens": 729881190.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.621117353439331, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8710538744926453, + "num_tokens": 729924517.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7652043104171753, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8834267854690552, + "num_tokens": 729956905.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5518238544464111, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8643041849136353, + "num_tokens": 729996897.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.523269534111023, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8835238814353943, + "num_tokens": 730037844.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5945687294006348, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8657465577125549, + "num_tokens": 730077815.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.782856822013855, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8730154037475586, + "num_tokens": 730116907.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.69429612159729, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8825730681419373, + "num_tokens": 730153531.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.8061662912368774, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8739478588104248, + "num_tokens": 730186868.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.637007236480713, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8713194131851196, + "num_tokens": 730224718.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6307560205459595, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8773914575576782, + "num_tokens": 730266669.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6940518617630005, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8703874349594116, + "num_tokens": 730300665.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5412003993988037, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8807312250137329, + "num_tokens": 730339243.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5800590515136719, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871627926826477, + "num_tokens": 730379996.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5661386251449585, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.886906623840332, + "num_tokens": 730417287.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5830646753311157, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8689416646957397, + "num_tokens": 730457283.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.5210808515548706, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8778752088546753, + "num_tokens": 730497816.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6526257991790771, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8739885091781616, + "num_tokens": 730534458.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6380890607833862, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8831714391708374, + "num_tokens": 730570796.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6199864149093628, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8780443668365479, + "num_tokens": 730608151.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.4843456745147705, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.896407961845398, + "num_tokens": 730650567.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.494434118270874, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8796796798706055, + "num_tokens": 730692407.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6218841075897217, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8851133584976196, + "num_tokens": 730732873.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6522793769836426, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8784340620040894, + "num_tokens": 730771837.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7102588415145874, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.872164785861969, + "num_tokens": 730810893.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7114225625991821, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8705041408538818, + "num_tokens": 730852335.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.65280282497406, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8702741861343384, + "num_tokens": 730889306.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5627669095993042, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8909701108932495, + "num_tokens": 730927561.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5549535751342773, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8844327926635742, + "num_tokens": 730968270.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5339696407318115, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.875864565372467, + "num_tokens": 731011130.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5826077461242676, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8852788209915161, + "num_tokens": 731051590.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6668623685836792, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8746028542518616, + "num_tokens": 731090610.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5496827363967896, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8819297552108765, + "num_tokens": 731129374.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.7287424802780151, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8592166900634766, + "num_tokens": 731168417.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6048563718795776, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8811794519424438, + "num_tokens": 731206318.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.72847580909729, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8749578595161438, + "num_tokens": 731246490.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5680904388427734, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8755860328674316, + "num_tokens": 731289242.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.599255084991455, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8797138333320618, + "num_tokens": 731329918.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.781558871269226, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8592277765274048, + "num_tokens": 731368985.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6589211225509644, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8734445571899414, + "num_tokens": 731405801.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.4917924404144287, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.870278537273407, + "num_tokens": 731450265.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.6744968891143799, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8652734756469727, + "num_tokens": 731486732.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6067602634429932, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8702417016029358, + "num_tokens": 731526590.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8425939083099365, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8748271465301514, + "num_tokens": 731558290.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6761351823806763, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8817731738090515, + "num_tokens": 731593699.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6331605911254883, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8845952153205872, + "num_tokens": 731630118.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5742921829223633, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8743955492973328, + "num_tokens": 731667680.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 2.8014183044433594e-05, + "grad_norm": 1.535254716873169, + "learning_rate": 1e-06, + "loss": 0.3107, + "mean_token_accuracy": 0.8983322978019714, + "num_tokens": 731702253.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7303287982940674, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8765543699264526, + "num_tokens": 731742358.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6139805316925049, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8694072961807251, + "num_tokens": 731781347.0, + "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6746606826782227, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.878476083278656, + "num_tokens": 731818439.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6971224546432495, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8811874389648438, + "num_tokens": 731853407.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6456317901611328, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8865739107131958, + "num_tokens": 731887795.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6277822256088257, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8823198080062866, + "num_tokens": 731924458.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.66288423538208, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8630266189575195, + "num_tokens": 731959722.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5783649682998657, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.876041054725647, + "num_tokens": 731999516.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5081323385238647, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.894721508026123, + "num_tokens": 732039557.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6427667140960693, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8736751079559326, + "num_tokens": 732076828.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8118022680282593, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8500798940658569, + "num_tokens": 732113825.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7111190557479858, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8891039490699768, + "num_tokens": 732144850.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8002853393554688, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8757864236831665, + "num_tokens": 732175984.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5895133018493652, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8821635246276855, + "num_tokens": 732210241.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.824069857597351, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.869113564491272, + "num_tokens": 732245711.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7134215831756592, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8752211332321167, + "num_tokens": 732280856.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.636684536933899, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8713000416755676, + "num_tokens": 732323481.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5164939165115356, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8746923208236694, + "num_tokens": 732370739.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.570363163948059, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8748992681503296, + "num_tokens": 732413213.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6056185960769653, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8790871500968933, + "num_tokens": 732448969.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5200309753417969, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8776708841323853, + "num_tokens": 732488831.0, + "step": 19198 + }, + { + "epoch": 2.4423101386592037, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6984978914260864, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8795409202575684, + "num_tokens": 732521764.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8740506172180176, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8811901807785034, + "num_tokens": 732555107.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6745673418045044, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8878018260002136, + "num_tokens": 732586409.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7843122482299805, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8779326677322388, + "num_tokens": 732624624.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6990622282028198, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8781864643096924, + "num_tokens": 732663107.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.650936484336853, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8813463449478149, + "num_tokens": 732700845.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7790855169296265, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.874620795249939, + "num_tokens": 732732849.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7167150974273682, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8653266429901123, + "num_tokens": 732772248.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.588059902191162, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8756192326545715, + "num_tokens": 732815132.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7201038599014282, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.872481107711792, + "num_tokens": 732852456.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6718701124191284, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8761385083198547, + "num_tokens": 732892664.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7877540588378906, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8637517690658569, + "num_tokens": 732929439.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6773470640182495, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8665034770965576, + "num_tokens": 732971339.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7622627019882202, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8658156394958496, + "num_tokens": 733011698.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5979373455047607, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8793075084686279, + "num_tokens": 733051155.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.766397476196289, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8800328969955444, + "num_tokens": 733085455.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.803606629371643, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8823603391647339, + "num_tokens": 733116326.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.538971185684204, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8858146667480469, + "num_tokens": 733158152.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8220770359039307, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8782204389572144, + "num_tokens": 733193957.0, + "step": 19217 + }, + { + "epoch": 2.4447271339524232, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7553976774215698, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8805177211761475, + "num_tokens": 733227878.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7777870893478394, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8783074617385864, + "num_tokens": 733258731.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6609464883804321, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.864458441734314, + "num_tokens": 733300348.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7970126867294312, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8695697784423828, + "num_tokens": 733341273.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8317381143569946, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.86395263671875, + "num_tokens": 733380596.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5628845691680908, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8795714974403381, + "num_tokens": 733420280.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5928659439086914, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.871320366859436, + "num_tokens": 733460215.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5100854635238647, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8771464824676514, + "num_tokens": 733500819.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5441161394119263, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.872928261756897, + "num_tokens": 733545776.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.770822525024414, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.875433087348938, + "num_tokens": 733579278.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6673808097839355, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8703448176383972, + "num_tokens": 733617193.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7058136463165283, + "learning_rate": 1e-06, + "loss": 0.3504, + "mean_token_accuracy": 0.8862685561180115, + "num_tokens": 733650648.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6538009643554688, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.8811307549476624, + "num_tokens": 733688100.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.801218032836914, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8765697479248047, + "num_tokens": 733719642.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.9260960817337036, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8856427073478699, + "num_tokens": 733746146.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.751956582069397, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.879723846912384, + "num_tokens": 733779116.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.65489661693573, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.883331298828125, + "num_tokens": 733816628.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8199692964553833, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8687770366668701, + "num_tokens": 733851328.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7132328748703003, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.880276083946228, + "num_tokens": 733886218.0, + "step": 19236 + }, + { + "epoch": 2.4471441292456433, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5977667570114136, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8890543580055237, + "num_tokens": 733925526.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8077428340911865, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8640506267547607, + "num_tokens": 733962307.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6325169801712036, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8737401962280273, + "num_tokens": 734003669.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7743556499481201, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8761552572250366, + "num_tokens": 734035501.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7884653806686401, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8957809805870056, + "num_tokens": 734066793.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7106388807296753, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8652646541595459, + "num_tokens": 734108877.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6629174947738647, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8866801261901855, + "num_tokens": 734145260.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5292528867721558, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.877460241317749, + "num_tokens": 734185915.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6445012092590332, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8805468678474426, + "num_tokens": 734222425.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6879947185516357, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8604400753974915, + "num_tokens": 734258146.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.694639801979065, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8569948673248291, + "num_tokens": 734295914.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6128190755844116, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8785374760627747, + "num_tokens": 734333828.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7542617321014404, + "learning_rate": 1e-06, + "loss": 0.3491, + "mean_token_accuracy": 0.8877212405204773, + "num_tokens": 734367216.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.715657114982605, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8828845024108887, + "num_tokens": 734407662.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5661205053329468, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8843411207199097, + "num_tokens": 734446191.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7256724834442139, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8803337812423706, + "num_tokens": 734484473.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6951254606246948, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8565755486488342, + "num_tokens": 734527127.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6096843481063843, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.872506856918335, + "num_tokens": 734568786.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5765728950500488, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.876064658164978, + "num_tokens": 734610718.0, + "step": 19255 + }, + { + "epoch": 2.449561124538863, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6184686422348022, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8833497762680054, + "num_tokens": 734649148.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.521498680114746, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8870961666107178, + "num_tokens": 734691493.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6224088668823242, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8782190084457397, + "num_tokens": 734728343.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6883440017700195, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8704847097396851, + "num_tokens": 734763955.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.719376802444458, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8628537058830261, + "num_tokens": 734799629.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6254348754882812, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.876995861530304, + "num_tokens": 734839474.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.749068260192871, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8745927214622498, + "num_tokens": 734873489.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6496741771697998, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8666290044784546, + "num_tokens": 734915677.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.807910442352295, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8803610801696777, + "num_tokens": 734947672.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.533144474029541, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8832195997238159, + "num_tokens": 734988882.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6422712802886963, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8872171640396118, + "num_tokens": 735023952.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6785765886306763, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8726534843444824, + "num_tokens": 735059194.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6780829429626465, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8777532577514648, + "num_tokens": 735096421.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5786066055297852, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8714035749435425, + "num_tokens": 735140783.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.78741455078125, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8621518611907959, + "num_tokens": 735173678.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6363171339035034, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8796485662460327, + "num_tokens": 735208537.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.703178882598877, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.878846287727356, + "num_tokens": 735245129.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5709002017974854, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8794975876808167, + "num_tokens": 735287357.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6605892181396484, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8813158869743347, + "num_tokens": 735327153.0, + "step": 19274 + }, + { + "epoch": 2.4519781198320825, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.675398349761963, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8589476346969604, + "num_tokens": 735368507.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6556282043457031, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8746182918548584, + "num_tokens": 735410720.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6081992387771606, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8735025525093079, + "num_tokens": 735447689.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.736595869064331, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8708252906799316, + "num_tokens": 735481161.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6133590936660767, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8735252618789673, + "num_tokens": 735525508.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7473324537277222, + "learning_rate": 1e-06, + "loss": 0.3266, + "mean_token_accuracy": 0.8913471698760986, + "num_tokens": 735555391.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6330317258834839, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8723897337913513, + "num_tokens": 735594295.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6326655149459839, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8883121013641357, + "num_tokens": 735632667.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8190747499465942, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8715426325798035, + "num_tokens": 735663307.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.615923285484314, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8694740533828735, + "num_tokens": 735703379.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5283762216567993, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8723945021629333, + "num_tokens": 735744223.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5311486721038818, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8884757161140442, + "num_tokens": 735784472.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6262167692184448, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8575456142425537, + "num_tokens": 735825417.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6098995208740234, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8793655633926392, + "num_tokens": 735864687.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5942302942276, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8671491146087646, + "num_tokens": 735904102.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5876336097717285, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8842406272888184, + "num_tokens": 735945551.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7462202310562134, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8652029633522034, + "num_tokens": 735984124.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.606878638267517, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8795340061187744, + "num_tokens": 736023338.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7391048669815063, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8698366284370422, + "num_tokens": 736055689.0, + "step": 19293 + }, + { + "epoch": 2.454395115125302, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.674095869064331, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8727978467941284, + "num_tokens": 736090015.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7042531967163086, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8826279640197754, + "num_tokens": 736127428.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6587302684783936, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8554391860961914, + "num_tokens": 736168012.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7018049955368042, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8471368551254272, + "num_tokens": 736205387.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5821702480316162, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8748394250869751, + "num_tokens": 736242177.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6894453763961792, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8852040767669678, + "num_tokens": 736278868.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6294691562652588, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8795490860939026, + "num_tokens": 736315004.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.580806016921997, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.877056896686554, + "num_tokens": 736356680.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7540624141693115, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8662176132202148, + "num_tokens": 736392924.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6832499504089355, + "learning_rate": 1e-06, + "loss": 0.3391, + "mean_token_accuracy": 0.8901675343513489, + "num_tokens": 736429818.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.704949975013733, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8578718304634094, + "num_tokens": 736469092.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7424746751785278, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8700656294822693, + "num_tokens": 736506542.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6010380983352661, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8704165816307068, + "num_tokens": 736546300.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.669821858406067, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8775079250335693, + "num_tokens": 736585292.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5756275653839111, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8740773797035217, + "num_tokens": 736626730.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6345723867416382, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8736177682876587, + "num_tokens": 736663794.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7364625930786133, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8710306882858276, + "num_tokens": 736698358.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8494014739990234, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8594596982002258, + "num_tokens": 736735579.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7518970966339111, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8708674907684326, + "num_tokens": 736772856.0, + "step": 19312 + }, + { + "epoch": 2.4568121104185217, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6815989017486572, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8710195422172546, + "num_tokens": 736809794.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6377302408218384, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8843576312065125, + "num_tokens": 736848502.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5146749019622803, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8799003958702087, + "num_tokens": 736890998.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5325428247451782, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.888036847114563, + "num_tokens": 736930839.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.678682804107666, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8839744329452515, + "num_tokens": 736964721.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5465048551559448, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.883635401725769, + "num_tokens": 737005907.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6125046014785767, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8836733102798462, + "num_tokens": 737047940.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.555485725402832, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.880807101726532, + "num_tokens": 737088894.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5259796380996704, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8722578287124634, + "num_tokens": 737133128.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6735566854476929, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8739651441574097, + "num_tokens": 737164542.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6670629978179932, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8749369382858276, + "num_tokens": 737198804.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5289510488510132, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8824783563613892, + "num_tokens": 737241330.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6645926237106323, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8664711713790894, + "num_tokens": 737279144.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.888730525970459, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8581550121307373, + "num_tokens": 737312808.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5552771091461182, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8667792081832886, + "num_tokens": 737357034.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6281768083572388, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8902265429496765, + "num_tokens": 737393966.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6700429916381836, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8754608631134033, + "num_tokens": 737432711.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.743091344833374, + "learning_rate": 1e-06, + "loss": 0.3124, + "mean_token_accuracy": 0.8961833715438843, + "num_tokens": 737466688.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6956994533538818, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8793493509292603, + "num_tokens": 737500315.0, + "step": 19331 + }, + { + "epoch": 2.4592291057117417, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.611872911453247, + "learning_rate": 1e-06, + "loss": 0.3398, + "mean_token_accuracy": 0.8904168605804443, + "num_tokens": 737535226.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5957894325256348, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8765190839767456, + "num_tokens": 737574822.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.622098684310913, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8798931837081909, + "num_tokens": 737612286.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6471792459487915, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8751300573348999, + "num_tokens": 737648674.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7475354671478271, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8930253982543945, + "num_tokens": 737684554.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7697275876998901, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.875568687915802, + "num_tokens": 737721767.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7184715270996094, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8792749047279358, + "num_tokens": 737753908.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.797738790512085, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8732994198799133, + "num_tokens": 737786935.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6265084743499756, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8800114989280701, + "num_tokens": 737824855.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 2.316626787185669, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8785547018051147, + "num_tokens": 737859904.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.599226951599121, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8701126575469971, + "num_tokens": 737898374.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6710858345031738, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8800355195999146, + "num_tokens": 737934579.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6148790121078491, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8861089944839478, + "num_tokens": 737973360.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6038520336151123, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8884688019752502, + "num_tokens": 738012479.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6347755193710327, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.868665874004364, + "num_tokens": 738050211.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6031569242477417, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8789541125297546, + "num_tokens": 738086304.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.679907202720642, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8791707754135132, + "num_tokens": 738121306.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.611961841583252, + "learning_rate": 1e-06, + "loss": 0.3356, + "mean_token_accuracy": 0.8886574506759644, + "num_tokens": 738155592.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7062371969223022, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8624441623687744, + "num_tokens": 738193174.0, + "step": 19350 + }, + { + "epoch": 2.4616461010049613, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.569749355316162, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8823996186256409, + "num_tokens": 738235941.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5101171731948853, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8892372250556946, + "num_tokens": 738275973.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7371814250946045, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8757937550544739, + "num_tokens": 738311624.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5234379768371582, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8750155568122864, + "num_tokens": 738353099.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.554200291633606, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8706651329994202, + "num_tokens": 738394600.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 2.29809308052063, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8684301376342773, + "num_tokens": 738432407.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6732280254364014, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8759936094284058, + "num_tokens": 738469069.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7026073932647705, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8753753900527954, + "num_tokens": 738507515.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6822936534881592, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.878760039806366, + "num_tokens": 738543470.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.622633695602417, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8735730648040771, + "num_tokens": 738585511.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6848026514053345, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.872787356376648, + "num_tokens": 738623286.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7402819395065308, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8643692135810852, + "num_tokens": 738657743.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.672544240951538, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8660554885864258, + "num_tokens": 738692328.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5620126724243164, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.8943542242050171, + "num_tokens": 738729408.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6282175779342651, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8699480295181274, + "num_tokens": 738768073.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7784099578857422, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.870141327381134, + "num_tokens": 738802029.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7421789169311523, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8605940937995911, + "num_tokens": 738840250.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5980440378189087, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8702974915504456, + "num_tokens": 738882467.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.646350383758545, + "learning_rate": 1e-06, + "loss": 0.3366, + "mean_token_accuracy": 0.890342116355896, + "num_tokens": 738918260.0, + "step": 19369 + }, + { + "epoch": 2.464063096298181, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8075569868087769, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.853553295135498, + "num_tokens": 738956359.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6142809391021729, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8909957408905029, + "num_tokens": 738992135.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.567597508430481, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8805864453315735, + "num_tokens": 739036015.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5132471323013306, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8839621543884277, + "num_tokens": 739078412.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7110912799835205, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8842474222183228, + "num_tokens": 739114100.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.630308747291565, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.872383713722229, + "num_tokens": 739152396.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.618765950202942, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.870427131652832, + "num_tokens": 739190457.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6724134683609009, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8749270439147949, + "num_tokens": 739229078.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6894118785858154, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8797250986099243, + "num_tokens": 739263245.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.600938320159912, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8752703666687012, + "num_tokens": 739303771.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7754658460617065, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.872093677520752, + "num_tokens": 739335388.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6541359424591064, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8657664060592651, + "num_tokens": 739375890.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5283344984054565, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8800880312919617, + "num_tokens": 739416264.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6431325674057007, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8625032305717468, + "num_tokens": 739454600.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8539366722106934, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8582352995872498, + "num_tokens": 739487397.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6653555631637573, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8706653118133545, + "num_tokens": 739527434.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5551245212554932, + "learning_rate": 1e-06, + "loss": 0.3404, + "mean_token_accuracy": 0.8906188607215881, + "num_tokens": 739567090.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5780513286590576, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8748021125793457, + "num_tokens": 739606096.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6296157836914062, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8822901248931885, + "num_tokens": 739641779.0, + "step": 19388 + }, + { + "epoch": 2.4664800915914005, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6310975551605225, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8664112091064453, + "num_tokens": 739681991.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6437376737594604, + "learning_rate": 1e-06, + "loss": 0.3569, + "mean_token_accuracy": 0.8840926885604858, + "num_tokens": 739720756.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6451787948608398, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8734613656997681, + "num_tokens": 739761003.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.644850730895996, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8727575540542603, + "num_tokens": 739800953.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5657446384429932, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8838080763816833, + "num_tokens": 739840357.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6474660634994507, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8840470314025879, + "num_tokens": 739874367.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.988613486289978, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8688994646072388, + "num_tokens": 739909281.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5620759725570679, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8732731938362122, + "num_tokens": 739952533.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7406635284423828, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8785065412521362, + "num_tokens": 739985854.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7490216493606567, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8560183048248291, + "num_tokens": 740023247.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5244855880737305, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8659704923629761, + "num_tokens": 740069153.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.853751301765442, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8820782899856567, + "num_tokens": 740100126.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6214371919631958, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8710198402404785, + "num_tokens": 740140534.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6041254997253418, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8819894194602966, + "num_tokens": 740177319.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5577203035354614, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8939321041107178, + "num_tokens": 740213171.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6304931640625, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8782991170883179, + "num_tokens": 740254051.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5676093101501465, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.886398434638977, + "num_tokens": 740294129.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6136890649795532, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8675596714019775, + "num_tokens": 740337172.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6922454833984375, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8704330921173096, + "num_tokens": 740373118.0, + "step": 19407 + }, + { + "epoch": 2.4688970868846205, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.624776005744934, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8543674945831299, + "num_tokens": 740416016.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.688515543937683, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8839659690856934, + "num_tokens": 740453068.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.608729600906372, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8779664635658264, + "num_tokens": 740493806.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6998273134231567, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8806939125061035, + "num_tokens": 740528380.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7246369123458862, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8743919134140015, + "num_tokens": 740564677.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.703944444656372, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8769762516021729, + "num_tokens": 740601303.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6261762380599976, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8660193681716919, + "num_tokens": 740642651.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.4765561819076538, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8724528551101685, + "num_tokens": 740687731.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.678754448890686, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8784266710281372, + "num_tokens": 740722310.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.9321593046188354, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8659647703170776, + "num_tokens": 740752001.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 2.8252601623535156e-05, + "grad_norm": 1.782385230064392, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8752486705780029, + "num_tokens": 740783907.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5132158994674683, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8695864677429199, + "num_tokens": 740823625.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5645745992660522, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8802310824394226, + "num_tokens": 740863214.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.778727412223816, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8598923683166504, + "num_tokens": 740900079.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6281994581222534, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.866550087928772, + "num_tokens": 740939822.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5945252180099487, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8802515864372253, + "num_tokens": 740978726.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6690315008163452, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8849022388458252, + "num_tokens": 741015068.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8239574432373047, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8766894340515137, + "num_tokens": 741050146.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6122276782989502, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8682155013084412, + "num_tokens": 741093327.0, + "step": 19426 + }, + { + "epoch": 2.47131408217784, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.708256721496582, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.876020610332489, + "num_tokens": 741133170.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6010119915008545, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8822501301765442, + "num_tokens": 741173110.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.632680058479309, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8675820231437683, + "num_tokens": 741213784.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5606571435928345, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8783286809921265, + "num_tokens": 741254836.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.650397539138794, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.875518798828125, + "num_tokens": 741293322.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7205562591552734, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8649352788925171, + "num_tokens": 741330185.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.494400978088379, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8794935345649719, + "num_tokens": 741374178.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5744292736053467, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8538803458213806, + "num_tokens": 741418379.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.537958025932312, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8716294169425964, + "num_tokens": 741459886.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6236119270324707, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8752896189689636, + "num_tokens": 741496202.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7387490272521973, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8662803173065186, + "num_tokens": 741536819.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5995330810546875, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8702965974807739, + "num_tokens": 741578705.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.577196717262268, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8614368438720703, + "num_tokens": 741620140.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5211966037750244, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8866953253746033, + "num_tokens": 741662544.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6772009134292603, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8697494864463806, + "num_tokens": 741704458.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.667796015739441, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8874800801277161, + "num_tokens": 741741510.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.8126955032348633, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.877882182598114, + "num_tokens": 741772445.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6286475658416748, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8736021518707275, + "num_tokens": 741810521.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.578082799911499, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8894084095954895, + "num_tokens": 741851409.0, + "step": 19445 + }, + { + "epoch": 2.4737310774710597, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5063554048538208, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8776649832725525, + "num_tokens": 741894955.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5463579893112183, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8828425407409668, + "num_tokens": 741932622.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 3.8040523529052734, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8856479525566101, + "num_tokens": 741962187.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5323944091796875, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8739748001098633, + "num_tokens": 742003073.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.66305410861969, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8614034056663513, + "num_tokens": 742041898.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.578519582748413, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8866791725158691, + "num_tokens": 742081302.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7135977745056152, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8791325092315674, + "num_tokens": 742114093.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7418713569641113, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8707540035247803, + "num_tokens": 742148100.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5537887811660767, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8897492289543152, + "num_tokens": 742186232.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7523993253707886, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8792026042938232, + "num_tokens": 742222923.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.581660509109497, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8797774910926819, + "num_tokens": 742262808.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7456891536712646, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8860535025596619, + "num_tokens": 742297452.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.7572412490844727, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8652210235595703, + "num_tokens": 742336540.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6454646587371826, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8780192136764526, + "num_tokens": 742375090.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5704880952835083, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.869175910949707, + "num_tokens": 742414591.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6031241416931152, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8666120767593384, + "num_tokens": 742456879.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.589045763015747, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8769304752349854, + "num_tokens": 742496143.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5327394008636475, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8811643123626709, + "num_tokens": 742538117.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5780017375946045, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8733064532279968, + "num_tokens": 742575107.0, + "step": 19464 + }, + { + "epoch": 2.4761480727642793, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.488155484199524, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8820947408676147, + "num_tokens": 742617446.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.6066994667053223, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8792660236358643, + "num_tokens": 742657966.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 1.5430718660354614, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8717440366744995, + "num_tokens": 742699393.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 2.8133392333984375e-05, + "grad_norm": 3.7114789485931396, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8793041110038757, + "num_tokens": 742738651.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6050609350204468, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.875280499458313, + "num_tokens": 742779345.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6702017784118652, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8728427886962891, + "num_tokens": 742816911.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7203634977340698, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8836523294448853, + "num_tokens": 742849639.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.697556972503662, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.879115104675293, + "num_tokens": 742887229.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6432563066482544, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.881831705570221, + "num_tokens": 742924606.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6420196294784546, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8539645671844482, + "num_tokens": 742966374.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.646038293838501, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8877135515213013, + "num_tokens": 743003760.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.83102285861969, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8714876174926758, + "num_tokens": 743035220.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5410826206207275, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8772318363189697, + "num_tokens": 743078607.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6097334623336792, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8683311343193054, + "num_tokens": 743121786.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.756847858428955, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8594101667404175, + "num_tokens": 743157030.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7941179275512695, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.880122184753418, + "num_tokens": 743190023.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7615412473678589, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8788701295852661, + "num_tokens": 743221786.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.536011815071106, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8741528987884521, + "num_tokens": 743263276.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6356642246246338, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8874579668045044, + "num_tokens": 743300447.0, + "step": 19483 + }, + { + "epoch": 2.478565068057499, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.568081259727478, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8844126462936401, + "num_tokens": 743342414.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 2.8252601623535156e-05, + "grad_norm": 1.5637786388397217, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8778855204582214, + "num_tokens": 743382488.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6250452995300293, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8652124404907227, + "num_tokens": 743424638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7801650762557983, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8598539233207703, + "num_tokens": 743464760.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.623687505722046, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8718591928482056, + "num_tokens": 743504261.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7216213941574097, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8707018494606018, + "num_tokens": 743538247.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6997101306915283, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8801059126853943, + "num_tokens": 743571743.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.77742338180542, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8787374496459961, + "num_tokens": 743605318.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6078424453735352, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8859532475471497, + "num_tokens": 743642637.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7341262102127075, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8677835464477539, + "num_tokens": 743678694.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5740203857421875, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8817732930183411, + "num_tokens": 743714926.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5143003463745117, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8726588487625122, + "num_tokens": 743757992.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5203713178634644, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8828041553497314, + "num_tokens": 743799555.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8857669830322266, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.860343337059021, + "num_tokens": 743835124.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5920870304107666, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8911375403404236, + "num_tokens": 743873076.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5941636562347412, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8842300176620483, + "num_tokens": 743908423.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6327447891235352, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8680434823036194, + "num_tokens": 743947484.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.631535530090332, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8826949596405029, + "num_tokens": 743981978.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5994981527328491, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8767588138580322, + "num_tokens": 744018641.0, + "step": 19502 + }, + { + "epoch": 2.480982063350719, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.677489995956421, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8717186450958252, + "num_tokens": 744057888.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.636645793914795, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8685121536254883, + "num_tokens": 744095162.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6566113233566284, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.882433295249939, + "num_tokens": 744130187.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5816789865493774, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8943268656730652, + "num_tokens": 744164335.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6939784288406372, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8627661466598511, + "num_tokens": 744204319.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6843464374542236, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8726056814193726, + "num_tokens": 744240508.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.882171630859375, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8558687567710876, + "num_tokens": 744270728.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7870944738388062, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8942805528640747, + "num_tokens": 744302984.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6241681575775146, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8522703647613525, + "num_tokens": 744347605.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5363030433654785, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8673017024993896, + "num_tokens": 744392161.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.639124870300293, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8764747381210327, + "num_tokens": 744427805.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.738090991973877, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8793678283691406, + "num_tokens": 744458631.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7173480987548828, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.864599883556366, + "num_tokens": 744493350.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7253084182739258, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8766897916793823, + "num_tokens": 744527950.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.837899923324585, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8730857372283936, + "num_tokens": 744562100.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5640109777450562, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8793120384216309, + "num_tokens": 744603451.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6374317407608032, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8766264319419861, + "num_tokens": 744642780.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5307250022888184, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8912069797515869, + "num_tokens": 744680108.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7284857034683228, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8781374096870422, + "num_tokens": 744713197.0, + "step": 19521 + }, + { + "epoch": 2.4833990586439385, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6643027067184448, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8726825714111328, + "num_tokens": 744751215.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.647064447402954, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.872220516204834, + "num_tokens": 744790379.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8923397064208984, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8862351775169373, + "num_tokens": 744821884.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8462878465652466, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8723834156990051, + "num_tokens": 744859413.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7083678245544434, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8805137872695923, + "num_tokens": 744899187.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7206536531448364, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8501854538917542, + "num_tokens": 744940119.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.85025954246521, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8648456335067749, + "num_tokens": 744972440.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.774054765701294, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8730124235153198, + "num_tokens": 745013230.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7784467935562134, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8758214116096497, + "num_tokens": 745047338.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8680949211120605, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8627108335494995, + "num_tokens": 745080862.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5405329465866089, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8547680974006653, + "num_tokens": 745125364.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 2.1652023792266846, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8562749624252319, + "num_tokens": 745156283.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7188583612442017, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8661764860153198, + "num_tokens": 745192864.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5367333889007568, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8775917887687683, + "num_tokens": 745232155.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.612099289894104, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8780850172042847, + "num_tokens": 745274244.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.757690191268921, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8853170871734619, + "num_tokens": 745312417.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.818687915802002, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8746179342269897, + "num_tokens": 745349895.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8138673305511475, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8604512214660645, + "num_tokens": 745386772.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7227249145507812, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8795349597930908, + "num_tokens": 745422115.0, + "step": 19540 + }, + { + "epoch": 2.485816053937158, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.605413556098938, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8822453618049622, + "num_tokens": 745463380.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7406750917434692, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8904030323028564, + "num_tokens": 745500098.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4755610227584839, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8759645223617554, + "num_tokens": 745545743.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5889073610305786, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8897145390510559, + "num_tokens": 745585095.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6070588827133179, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8769782781600952, + "num_tokens": 745620536.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6491059064865112, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.884212076663971, + "num_tokens": 745658683.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6249690055847168, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8650093078613281, + "num_tokens": 745701839.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.798938512802124, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8600303530693054, + "num_tokens": 745735300.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6011450290679932, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8812543749809265, + "num_tokens": 745774506.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8130676746368408, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8791142106056213, + "num_tokens": 745807539.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.719600796699524, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8638705015182495, + "num_tokens": 745843151.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5216037034988403, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.868810772895813, + "num_tokens": 745884605.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5319222211837769, + "learning_rate": 1e-06, + "loss": 0.3147, + "mean_token_accuracy": 0.8992040157318115, + "num_tokens": 745925387.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7352936267852783, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8766778707504272, + "num_tokens": 745962445.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.633410930633545, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8848643898963928, + "num_tokens": 746000519.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6371839046478271, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8704445958137512, + "num_tokens": 746037946.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7635829448699951, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.874229907989502, + "num_tokens": 746071136.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.541551947593689, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8746329545974731, + "num_tokens": 746114162.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.590103030204773, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8865963220596313, + "num_tokens": 746149960.0, + "step": 19559 + }, + { + "epoch": 2.4882330492303777, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5706459283828735, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8654735088348389, + "num_tokens": 746192120.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7088415622711182, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8570685386657715, + "num_tokens": 746231437.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6759425401687622, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8727220892906189, + "num_tokens": 746268180.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.895843744277954, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8570689558982849, + "num_tokens": 746303329.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.587785243988037, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8800472021102905, + "num_tokens": 746343662.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.823792576789856, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8729854822158813, + "num_tokens": 746375144.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6241440773010254, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.856127142906189, + "num_tokens": 746418775.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5762126445770264, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8829808235168457, + "num_tokens": 746459700.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6363797187805176, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8880534172058105, + "num_tokens": 746495961.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.634229302406311, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8898659348487854, + "num_tokens": 746530368.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6715048551559448, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8770517110824585, + "num_tokens": 746569512.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6486070156097412, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8827996850013733, + "num_tokens": 746604563.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5650980472564697, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8839143514633179, + "num_tokens": 746643210.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5726884603500366, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8717056512832642, + "num_tokens": 746684014.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7216495275497437, + "learning_rate": 1e-06, + "loss": 0.3352, + "mean_token_accuracy": 0.8894473314285278, + "num_tokens": 746716183.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5346400737762451, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.876765787601471, + "num_tokens": 746759842.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6578774452209473, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8740141987800598, + "num_tokens": 746794321.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5945709943771362, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8642510771751404, + "num_tokens": 746837761.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.599806308746338, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8665937781333923, + "num_tokens": 746879259.0, + "step": 19578 + }, + { + "epoch": 2.4906500445235977, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 2.5192391872406006, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8876999020576477, + "num_tokens": 746911312.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7866709232330322, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8678371906280518, + "num_tokens": 746946034.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5831772089004517, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8705770373344421, + "num_tokens": 746986323.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6793636083602905, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8731564283370972, + "num_tokens": 747021539.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.761116623878479, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8769382834434509, + "num_tokens": 747056556.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.467055082321167, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8784520626068115, + "num_tokens": 747098333.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.673885464668274, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8824619650840759, + "num_tokens": 747134338.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5159778594970703, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.885126531124115, + "num_tokens": 747173166.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6873393058776855, + "learning_rate": 1e-06, + "loss": 0.3297, + "mean_token_accuracy": 0.8905206918716431, + "num_tokens": 747205769.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.606053113937378, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8726951479911804, + "num_tokens": 747246958.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5732539892196655, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8742835521697998, + "num_tokens": 747290231.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6307950019836426, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8855684399604797, + "num_tokens": 747324278.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8222533464431763, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8647267818450928, + "num_tokens": 747356394.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5058132410049438, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8856512308120728, + "num_tokens": 747399196.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7774876356124878, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8885799646377563, + "num_tokens": 747429029.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6019283533096313, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8735203742980957, + "num_tokens": 747468988.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5833603143692017, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.889239490032196, + "num_tokens": 747506982.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7066371440887451, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.868034839630127, + "num_tokens": 747545382.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6416791677474976, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8724560141563416, + "num_tokens": 747584426.0, + "step": 19597 + }, + { + "epoch": 2.4930670398168173, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4611707925796509, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8860440254211426, + "num_tokens": 747627912.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6633155345916748, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8772908449172974, + "num_tokens": 747662969.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5320556163787842, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8807844519615173, + "num_tokens": 747703657.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6284388303756714, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8794053196907043, + "num_tokens": 747741233.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5692776441574097, + "learning_rate": 1e-06, + "loss": 0.3328, + "mean_token_accuracy": 0.89090496301651, + "num_tokens": 747778963.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6606206893920898, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8782497644424438, + "num_tokens": 747820943.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.720518708229065, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8612605333328247, + "num_tokens": 747858858.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6869077682495117, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8817881941795349, + "num_tokens": 747891803.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5946301221847534, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8848903775215149, + "num_tokens": 747932736.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.575777530670166, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8695812821388245, + "num_tokens": 747978810.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.772030234336853, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8617891073226929, + "num_tokens": 748013940.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7271231412887573, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8679423332214355, + "num_tokens": 748050504.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8019849061965942, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.85626220703125, + "num_tokens": 748087807.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6498044729232788, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8752207159996033, + "num_tokens": 748125444.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5534974336624146, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8724257946014404, + "num_tokens": 748168420.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6524765491485596, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8732541799545288, + "num_tokens": 748207914.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6660100221633911, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.877882719039917, + "num_tokens": 748243148.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7679160833358765, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8727333545684814, + "num_tokens": 748276970.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.587876558303833, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8790162205696106, + "num_tokens": 748314691.0, + "step": 19616 + }, + { + "epoch": 2.495484035110037, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.507716178894043, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8743083477020264, + "num_tokens": 748356963.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.733373761177063, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8519944548606873, + "num_tokens": 748392000.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6832746267318726, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8734346628189087, + "num_tokens": 748428435.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5977617502212524, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8741003274917603, + "num_tokens": 748468799.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8597850799560547, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8616223335266113, + "num_tokens": 748499232.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.562508225440979, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8820824027061462, + "num_tokens": 748541223.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.686285376548767, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8733991980552673, + "num_tokens": 748582810.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5462850332260132, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8879607915878296, + "num_tokens": 748622000.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4916728734970093, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8872148990631104, + "num_tokens": 748663820.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7381680011749268, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8563633561134338, + "num_tokens": 748701180.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4953014850616455, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8833556175231934, + "num_tokens": 748743012.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.591204047203064, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8744146823883057, + "num_tokens": 748783886.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7090392112731934, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8600855469703674, + "num_tokens": 748823084.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6387354135513306, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8801486492156982, + "num_tokens": 748859348.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5744726657867432, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8785057067871094, + "num_tokens": 748898635.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.568579912185669, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8685097694396973, + "num_tokens": 748938963.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7485229969024658, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8753107190132141, + "num_tokens": 748973048.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6667193174362183, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8828212022781372, + "num_tokens": 749007899.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6988403797149658, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8764687180519104, + "num_tokens": 749048383.0, + "step": 19635 + }, + { + "epoch": 2.4979010304032565, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6657432317733765, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8831169605255127, + "num_tokens": 749086092.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6756969690322876, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8605616092681885, + "num_tokens": 749128314.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.573219656944275, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8840527534484863, + "num_tokens": 749169438.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 3.770189046859741, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8670972585678101, + "num_tokens": 749206654.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7697759866714478, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8682316541671753, + "num_tokens": 749243754.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5604567527770996, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.88780277967453, + "num_tokens": 749283871.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.678041696548462, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8692535161972046, + "num_tokens": 749318965.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5454604625701904, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8812406659126282, + "num_tokens": 749359382.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6505731344223022, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8692783117294312, + "num_tokens": 749394323.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4350327253341675, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8671929836273193, + "num_tokens": 749443460.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6827391386032104, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8787785768508911, + "num_tokens": 749484297.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6842342615127563, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.867645263671875, + "num_tokens": 749526802.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.598483681678772, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8698616027832031, + "num_tokens": 749568404.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6095136404037476, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8686439990997314, + "num_tokens": 749612709.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8038583993911743, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8801988363265991, + "num_tokens": 749642965.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6736167669296265, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8796369433403015, + "num_tokens": 749678313.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6039416790008545, + "learning_rate": 1e-06, + "loss": 0.3396, + "mean_token_accuracy": 0.8875342607498169, + "num_tokens": 749714445.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.673561453819275, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8691750168800354, + "num_tokens": 749751948.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6349960565567017, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8753237724304199, + "num_tokens": 749790594.0, + "step": 19654 + }, + { + "epoch": 2.500318025696476, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6078119277954102, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8711016774177551, + "num_tokens": 749829767.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.622269630432129, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8799152374267578, + "num_tokens": 749864635.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6786625385284424, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8585265874862671, + "num_tokens": 749906932.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6513208150863647, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8664824962615967, + "num_tokens": 749944714.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5931740999221802, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8830435276031494, + "num_tokens": 749981503.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6042497158050537, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8616542816162109, + "num_tokens": 750024784.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5605188608169556, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8849540948867798, + "num_tokens": 750068034.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5685093402862549, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8768981099128723, + "num_tokens": 750108563.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6753098964691162, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8793337941169739, + "num_tokens": 750142108.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5849111080169678, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.879719614982605, + "num_tokens": 750181344.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.658110499382019, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8692634701728821, + "num_tokens": 750219485.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5495777130126953, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8754551410675049, + "num_tokens": 750263574.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5307806730270386, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8880660533905029, + "num_tokens": 750302722.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.584936261177063, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8833016157150269, + "num_tokens": 750341916.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5296047925949097, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8717178106307983, + "num_tokens": 750382710.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.718900203704834, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8607091903686523, + "num_tokens": 750417060.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7279325723648071, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8745877146720886, + "num_tokens": 750449317.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6010633707046509, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8827354907989502, + "num_tokens": 750490517.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5606939792633057, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.874001145362854, + "num_tokens": 750532761.0, + "step": 19673 + }, + { + "epoch": 2.5027350209896957, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5904484987258911, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.863705039024353, + "num_tokens": 750577983.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7284282445907593, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8721305727958679, + "num_tokens": 750611538.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5134618282318115, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8769802451133728, + "num_tokens": 750654131.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5806421041488647, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.885312557220459, + "num_tokens": 750690103.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.655834436416626, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8731108903884888, + "num_tokens": 750727876.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6515417098999023, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8840914964675903, + "num_tokens": 750764443.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6934590339660645, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8803281784057617, + "num_tokens": 750800046.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6218068599700928, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8813214302062988, + "num_tokens": 750835948.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6556596755981445, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8699086904525757, + "num_tokens": 750876748.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7144076824188232, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.86372309923172, + "num_tokens": 750918502.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.789404273033142, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8615688681602478, + "num_tokens": 750954206.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7290844917297363, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8725993037223816, + "num_tokens": 750992326.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7634849548339844, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8770261406898499, + "num_tokens": 751026392.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.603908896446228, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8758711814880371, + "num_tokens": 751065764.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7434113025665283, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8924720883369446, + "num_tokens": 751093941.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6177244186401367, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8803657293319702, + "num_tokens": 751131078.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6108431816101074, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8736433982849121, + "num_tokens": 751171718.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5962481498718262, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8888612985610962, + "num_tokens": 751206342.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6527140140533447, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8604153394699097, + "num_tokens": 751246560.0, + "step": 19692 + }, + { + "epoch": 2.5051520162829157, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.777901530265808, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8572845458984375, + "num_tokens": 751283133.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 3.6901423931121826, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8743991255760193, + "num_tokens": 751318073.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.814808964729309, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.85862135887146, + "num_tokens": 751351720.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8385629653930664, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8790477514266968, + "num_tokens": 751382350.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6987076997756958, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8801437616348267, + "num_tokens": 751417818.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7133188247680664, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8859951496124268, + "num_tokens": 751451762.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7206929922103882, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8905994296073914, + "num_tokens": 751481592.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6464835405349731, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8899636268615723, + "num_tokens": 751515946.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.65268874168396, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8749021291732788, + "num_tokens": 751553398.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5496623516082764, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8704497814178467, + "num_tokens": 751594942.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6948148012161255, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8678710460662842, + "num_tokens": 751636219.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5345200300216675, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.865470826625824, + "num_tokens": 751680587.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6205406188964844, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8798580169677734, + "num_tokens": 751717419.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 2.1475133895874023, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8778137564659119, + "num_tokens": 751758720.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.862596869468689, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8785439729690552, + "num_tokens": 751792260.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8386378288269043, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.860872209072113, + "num_tokens": 751827048.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6969709396362305, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8813600540161133, + "num_tokens": 751864019.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.9203031063079834, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.8884704113006592, + "num_tokens": 751892113.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5071269273757935, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8805659413337708, + "num_tokens": 751932726.0, + "step": 19711 + }, + { + "epoch": 2.5075690115761353, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6514315605163574, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8742175102233887, + "num_tokens": 751971679.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6607505083084106, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8661919832229614, + "num_tokens": 752011644.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.638077735900879, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.894483208656311, + "num_tokens": 752047105.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.654441237449646, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.87103670835495, + "num_tokens": 752087110.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.574662446975708, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8658331036567688, + "num_tokens": 752127237.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7102662324905396, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8500444889068604, + "num_tokens": 752165981.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5985052585601807, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8746724128723145, + "num_tokens": 752203627.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6077853441238403, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8640638589859009, + "num_tokens": 752243338.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6033521890640259, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8696242570877075, + "num_tokens": 752283836.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8313257694244385, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8600267171859741, + "num_tokens": 752316191.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 3.7095985412597656, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.878440797328949, + "num_tokens": 752355417.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6691211462020874, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8549255132675171, + "num_tokens": 752398340.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7590112686157227, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8775031566619873, + "num_tokens": 752432637.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.652945637702942, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8650234341621399, + "num_tokens": 752471242.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6592435836791992, + "learning_rate": 1e-06, + "loss": 0.3207, + "mean_token_accuracy": 0.8935997486114502, + "num_tokens": 752506508.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5757288932800293, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.87178635597229, + "num_tokens": 752549472.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6890555620193481, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8765280246734619, + "num_tokens": 752585042.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.702538251876831, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8681383728981018, + "num_tokens": 752623558.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5932326316833496, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8791718482971191, + "num_tokens": 752666692.0, + "step": 19730 + }, + { + "epoch": 2.509986006869355, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5469648838043213, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.884442925453186, + "num_tokens": 752706304.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6629966497421265, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8773508071899414, + "num_tokens": 752743634.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8072279691696167, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8798258900642395, + "num_tokens": 752774013.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5276463031768799, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8699960708618164, + "num_tokens": 752811336.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.605938196182251, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8770864009857178, + "num_tokens": 752850096.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7161535024642944, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8794029355049133, + "num_tokens": 752888747.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6683698892593384, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8570864796638489, + "num_tokens": 752925385.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6164460182189941, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8597183227539062, + "num_tokens": 752969638.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6658707857131958, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.876074492931366, + "num_tokens": 753005236.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4617323875427246, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8922505974769592, + "num_tokens": 753048508.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6966724395751953, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8601185083389282, + "num_tokens": 753089720.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6123641729354858, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8756800293922424, + "num_tokens": 753128545.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.672237515449524, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8556504249572754, + "num_tokens": 753170128.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.718098759651184, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.877785861492157, + "num_tokens": 753204972.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5995014905929565, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8722448348999023, + "num_tokens": 753243405.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8529847860336304, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8606078624725342, + "num_tokens": 753276797.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6457502841949463, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8894420862197876, + "num_tokens": 753313034.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6890071630477905, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8768913745880127, + "num_tokens": 753352166.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7079988718032837, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8764663934707642, + "num_tokens": 753389153.0, + "step": 19749 + }, + { + "epoch": 2.512403002162575, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6721936464309692, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8886849880218506, + "num_tokens": 753425379.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7165913581848145, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8698707818984985, + "num_tokens": 753465606.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.541934847831726, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8812493085861206, + "num_tokens": 753504987.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4997633695602417, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8750340938568115, + "num_tokens": 753549588.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6436408758163452, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8777009844779968, + "num_tokens": 753585628.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 3.688905954360962, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8766553401947021, + "num_tokens": 753626200.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7814894914627075, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.869443953037262, + "num_tokens": 753662035.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.631546139717102, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8684480786323547, + "num_tokens": 753699823.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7645751237869263, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8653440475463867, + "num_tokens": 753738131.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.537764072418213, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8799664378166199, + "num_tokens": 753776062.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8126307725906372, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8849594593048096, + "num_tokens": 753807717.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6383196115493774, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8619890213012695, + "num_tokens": 753847613.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.763289451599121, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.859385073184967, + "num_tokens": 753885578.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6394925117492676, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8718942999839783, + "num_tokens": 753923122.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5909483432769775, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8717337846755981, + "num_tokens": 753968881.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6087963581085205, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.860582709312439, + "num_tokens": 754011808.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6457536220550537, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8838506937026978, + "num_tokens": 754045726.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5147477388381958, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8857114315032959, + "num_tokens": 754085931.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6478440761566162, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8622881174087524, + "num_tokens": 754126140.0, + "step": 19768 + }, + { + "epoch": 2.5148199974557945, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8582165241241455, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8472893834114075, + "num_tokens": 754159992.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7485973834991455, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8866978883743286, + "num_tokens": 754191654.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6552773714065552, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8843240141868591, + "num_tokens": 754230372.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.660049319267273, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8707329630851746, + "num_tokens": 754267052.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.553115725517273, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8801009058952332, + "num_tokens": 754308536.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.613574743270874, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.879630446434021, + "num_tokens": 754345277.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6366583108901978, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8798544406890869, + "num_tokens": 754384297.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 16.829801559448242, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.872829258441925, + "num_tokens": 754424021.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5508257150650024, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8812349438667297, + "num_tokens": 754461549.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5569649934768677, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8839901685714722, + "num_tokens": 754498592.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5759347677230835, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8756437301635742, + "num_tokens": 754541064.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6430779695510864, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8606405258178711, + "num_tokens": 754581050.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6887402534484863, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8704602718353271, + "num_tokens": 754617986.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7707128524780273, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8674502372741699, + "num_tokens": 754655235.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5761293172836304, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.877531111240387, + "num_tokens": 754695794.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7549916505813599, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8814164400100708, + "num_tokens": 754730044.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5657958984375, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8691786527633667, + "num_tokens": 754771389.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6040931940078735, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8833348751068115, + "num_tokens": 754804896.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6809405088424683, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8765432238578796, + "num_tokens": 754840349.0, + "step": 19787 + }, + { + "epoch": 2.517236992749014, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6518391370773315, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8845332860946655, + "num_tokens": 754880497.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6429951190948486, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8811118006706238, + "num_tokens": 754916287.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.590458631515503, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8834877610206604, + "num_tokens": 754951416.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5398107767105103, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8770778179168701, + "num_tokens": 754995756.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6823601722717285, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8689616918563843, + "num_tokens": 755034561.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.549450397491455, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.88552325963974, + "num_tokens": 755074063.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5214564800262451, + "learning_rate": 1e-06, + "loss": 0.3281, + "mean_token_accuracy": 0.8937963843345642, + "num_tokens": 755111537.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5851291418075562, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.875728189945221, + "num_tokens": 755155411.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6271005868911743, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.877619743347168, + "num_tokens": 755191841.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5338945388793945, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8795679807662964, + "num_tokens": 755229881.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5348697900772095, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8784509897232056, + "num_tokens": 755270933.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7884963750839233, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8753930926322937, + "num_tokens": 755307788.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7101788520812988, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8763998746871948, + "num_tokens": 755342515.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6789432764053345, + "learning_rate": 1e-06, + "loss": 0.3293, + "mean_token_accuracy": 0.8922585844993591, + "num_tokens": 755377286.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6829200983047485, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.866864800453186, + "num_tokens": 755418695.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4958854913711548, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.883771538734436, + "num_tokens": 755461720.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6456788778305054, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8694332242012024, + "num_tokens": 755500955.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4693262577056885, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8710905909538269, + "num_tokens": 755547956.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8701834678649902, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8765432238578796, + "num_tokens": 755576855.0, + "step": 19806 + }, + { + "epoch": 2.5196539880422337, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6136101484298706, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8735405802726746, + "num_tokens": 755616137.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5392075777053833, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8882898688316345, + "num_tokens": 755655150.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5620760917663574, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8768203854560852, + "num_tokens": 755697725.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6865365505218506, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8662606477737427, + "num_tokens": 755734172.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.77066171169281, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8718218803405762, + "num_tokens": 755769302.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6169755458831787, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8722552061080933, + "num_tokens": 755808709.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7521746158599854, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8662124872207642, + "num_tokens": 755847200.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5609811544418335, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8677831888198853, + "num_tokens": 755892354.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6540757417678833, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8786072731018066, + "num_tokens": 755929583.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.4859273433685303, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8681405782699585, + "num_tokens": 755972322.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5885071754455566, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8779456615447998, + "num_tokens": 756010300.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5268303155899048, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8958837985992432, + "num_tokens": 756047476.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5962753295898438, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.888249397277832, + "num_tokens": 756082541.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.608512043952942, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8798636198043823, + "num_tokens": 756122246.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5449485778808594, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8823131322860718, + "num_tokens": 756162152.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7138822078704834, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8703650236129761, + "num_tokens": 756199410.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7440483570098877, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8740552067756653, + "num_tokens": 756235800.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5536466836929321, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8847639560699463, + "num_tokens": 756275200.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.568223237991333, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8691074848175049, + "num_tokens": 756315932.0, + "step": 19825 + }, + { + "epoch": 2.5220709833354533, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5714471340179443, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8664153218269348, + "num_tokens": 756361621.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.9389541149139404, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8726165890693665, + "num_tokens": 756391666.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6102817058563232, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8720442056655884, + "num_tokens": 756431578.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8453377485275269, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.88483065366745, + "num_tokens": 756463948.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5853559970855713, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8753443360328674, + "num_tokens": 756507157.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6243855953216553, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.881580650806427, + "num_tokens": 756545858.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6287422180175781, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8730891942977905, + "num_tokens": 756584489.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.8938535451889038, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8854255676269531, + "num_tokens": 756617218.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.664746880531311, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8708232641220093, + "num_tokens": 756656805.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7926480770111084, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8716148138046265, + "num_tokens": 756696715.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6044540405273438, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8819243907928467, + "num_tokens": 756733626.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.833817481994629, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8775022029876709, + "num_tokens": 756764847.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.670394778251648, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8594588041305542, + "num_tokens": 756808718.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6025773286819458, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8846242427825928, + "num_tokens": 756849099.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6279675960540771, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8802438974380493, + "num_tokens": 756888652.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.77859628200531, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8812925815582275, + "num_tokens": 756920512.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5856608152389526, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8688793182373047, + "num_tokens": 756964525.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.71036958694458, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8749740123748779, + "num_tokens": 757000700.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6687535047531128, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.873185396194458, + "num_tokens": 757039664.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7040960788726807, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8617141246795654, + "num_tokens": 757078144.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.595288634300232, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8773844242095947, + "num_tokens": 757114833.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5859568119049072, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8714125156402588, + "num_tokens": 757156051.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6064023971557617, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8788692355155945, + "num_tokens": 757196147.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.786512851715088, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8789781928062439, + "num_tokens": 757228638.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.665765404701233, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8667160868644714, + "num_tokens": 757267444.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6890287399291992, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8845693469047546, + "num_tokens": 757302079.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5145721435546875, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8835132122039795, + "num_tokens": 757342974.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6815521717071533, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8660700917243958, + "num_tokens": 757379555.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7991336584091187, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8613809943199158, + "num_tokens": 757415755.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6702958345413208, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8789702653884888, + "num_tokens": 757450213.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5369993448257446, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8719056844711304, + "num_tokens": 757493484.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5199475288391113, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8834608793258667, + "num_tokens": 757535895.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8702301979064941, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8867293000221252, + "num_tokens": 757566984.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6510215997695923, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8843430280685425, + "num_tokens": 757603707.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5939581394195557, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8766797780990601, + "num_tokens": 757640415.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.560430645942688, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8820469379425049, + "num_tokens": 757681247.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.7117063999176025, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8742249608039856, + "num_tokens": 757717612.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.5550881624221802, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8749105334281921, + "num_tokens": 757762314.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 2.8371810913085938e-05, + "grad_norm": 1.6405586004257202, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.884556770324707, + "num_tokens": 757800051.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.687199592590332, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8786718249320984, + "num_tokens": 757838631.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6482672691345215, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8821847438812256, + "num_tokens": 757874761.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6453624963760376, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8606898784637451, + "num_tokens": 757910756.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.706769347190857, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8628325462341309, + "num_tokens": 757948270.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6852401494979858, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8830389976501465, + "num_tokens": 757980753.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6388686895370483, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8751139640808105, + "num_tokens": 758021728.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6259959936141968, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8741277456283569, + "num_tokens": 758059588.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5926810503005981, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8819013833999634, + "num_tokens": 758097866.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5676101446151733, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8901609182357788, + "num_tokens": 758136980.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7531100511550903, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8897128105163574, + "num_tokens": 758167511.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.753020167350769, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8903542160987854, + "num_tokens": 758202523.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6053190231323242, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8735702037811279, + "num_tokens": 758246441.0, + "step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6913543939590454, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8775869011878967, + "num_tokens": 758283934.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7714053392410278, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8724158406257629, + "num_tokens": 758316411.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6649054288864136, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8749788999557495, + "num_tokens": 758354719.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.54324209690094, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.87990403175354, + "num_tokens": 758393597.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6955184936523438, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8669321537017822, + "num_tokens": 758429140.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.637741208076477, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8749574422836304, + "num_tokens": 758467097.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5919939279556274, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8726077079772949, + "num_tokens": 758505156.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6806365251541138, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8743164539337158, + "num_tokens": 758541094.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4524405002593994, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.881392776966095, + "num_tokens": 758588353.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6451618671417236, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8726488947868347, + "num_tokens": 758629307.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.506052017211914, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8856961131095886, + "num_tokens": 758672599.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5811316967010498, + "learning_rate": 1e-06, + "loss": 0.3306, + "mean_token_accuracy": 0.8922692537307739, + "num_tokens": 758706200.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.506289005279541, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8763951659202576, + "num_tokens": 758749084.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6222617626190186, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.884759783744812, + "num_tokens": 758784354.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6657774448394775, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8815130591392517, + "num_tokens": 758824458.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6530568599700928, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8882467150688171, + "num_tokens": 758858304.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.738411545753479, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.864337146282196, + "num_tokens": 758892886.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6045773029327393, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8636670112609863, + "num_tokens": 758935426.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5602142810821533, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8735048174858093, + "num_tokens": 758974705.0, + "step": 19895 + }, + { + "epoch": 2.5309757028367894, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6824162006378174, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.87613844871521, + "num_tokens": 759012545.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7319550514221191, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8699893951416016, + "num_tokens": 759050657.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7423828840255737, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8761993646621704, + "num_tokens": 759091605.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5921016931533813, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8702590465545654, + "num_tokens": 759130321.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5688875913619995, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8723541498184204, + "num_tokens": 759172285.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4623316526412964, + "learning_rate": 1e-06, + "loss": 0.3358, + "mean_token_accuracy": 0.8911540508270264, + "num_tokens": 759214952.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.589530110359192, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8682321310043335, + "num_tokens": 759255542.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7773419618606567, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8779410123825073, + "num_tokens": 759291260.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.675412654876709, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8582793474197388, + "num_tokens": 759330316.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8956102132797241, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8696328997612, + "num_tokens": 759359667.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.669753074645996, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8775814771652222, + "num_tokens": 759395646.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8945481777191162, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8516104221343994, + "num_tokens": 759428762.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7806062698364258, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8678758144378662, + "num_tokens": 759463320.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6323124170303345, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8727447986602783, + "num_tokens": 759501818.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6233478784561157, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8784376978874207, + "num_tokens": 759542446.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5741002559661865, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8588922023773193, + "num_tokens": 759583148.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6353727579116821, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8641327619552612, + "num_tokens": 759623594.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.619590163230896, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8703104853630066, + "num_tokens": 759663922.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7788978815078735, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8690135478973389, + "num_tokens": 759702807.0, + "step": 19914 + }, + { + "epoch": 2.533392698130009, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8961421251296997, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8647332191467285, + "num_tokens": 759731754.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.644494891166687, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8645753860473633, + "num_tokens": 759769108.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6633411645889282, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8675249814987183, + "num_tokens": 759810582.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5846567153930664, + "learning_rate": 1e-06, + "loss": 0.3271, + "mean_token_accuracy": 0.8941226005554199, + "num_tokens": 759851129.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6251736879348755, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8712756633758545, + "num_tokens": 759892664.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7277437448501587, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8679851293563843, + "num_tokens": 759931145.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6775238513946533, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8696211576461792, + "num_tokens": 759971262.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7104012966156006, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8817702531814575, + "num_tokens": 760004262.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5955791473388672, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8898006677627563, + "num_tokens": 760043575.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6795601844787598, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8828498125076294, + "num_tokens": 760082196.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6873810291290283, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8779200911521912, + "num_tokens": 760115984.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6503678560256958, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8739711046218872, + "num_tokens": 760153902.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8257262706756592, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8842190504074097, + "num_tokens": 760182654.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6936671733856201, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8812633156776428, + "num_tokens": 760216746.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6683975458145142, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8639622330665588, + "num_tokens": 760259023.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5160210132598877, + "learning_rate": 1e-06, + "loss": 0.3209, + "mean_token_accuracy": 0.8971361517906189, + "num_tokens": 760297646.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5387144088745117, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.8875430822372437, + "num_tokens": 760334298.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6444576978683472, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8697447776794434, + "num_tokens": 760371228.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7028238773345947, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8725000619888306, + "num_tokens": 760409462.0, + "step": 19933 + }, + { + "epoch": 2.5358096934232286, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.725724458694458, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8624703884124756, + "num_tokens": 760448604.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.479141116142273, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8844481110572815, + "num_tokens": 760490918.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7211129665374756, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8776470422744751, + "num_tokens": 760530111.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5161417722702026, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8848037123680115, + "num_tokens": 760572736.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7011977434158325, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8869138956069946, + "num_tokens": 760610013.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5160489082336426, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8844748139381409, + "num_tokens": 760651266.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.657662034034729, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8683561682701111, + "num_tokens": 760690378.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.73244309425354, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8672749996185303, + "num_tokens": 760731732.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7922909259796143, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.863425612449646, + "num_tokens": 760771431.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8492522239685059, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8621549606323242, + "num_tokens": 760804279.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7469669580459595, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8704128265380859, + "num_tokens": 760839951.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7672502994537354, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8657491207122803, + "num_tokens": 760876337.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5199382305145264, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8755026459693909, + "num_tokens": 760918529.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.679650902748108, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8761246204376221, + "num_tokens": 760959211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6344349384307861, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.870803952217102, + "num_tokens": 760999323.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.701383352279663, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8763595819473267, + "num_tokens": 761033358.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5529621839523315, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8804159164428711, + "num_tokens": 761077684.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.730863332748413, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8741081953048706, + "num_tokens": 761112127.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8324319124221802, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8718506097793579, + "num_tokens": 761143655.0, + "step": 19952 + }, + { + "epoch": 2.538226688716448, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7460780143737793, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.876855731010437, + "num_tokens": 761176432.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7155301570892334, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8796073794364929, + "num_tokens": 761212717.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4423015117645264, + "learning_rate": 1e-06, + "loss": 0.3406, + "mean_token_accuracy": 0.8905925750732422, + "num_tokens": 761254240.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5215868949890137, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8794530630111694, + "num_tokens": 761295703.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7450867891311646, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8664544224739075, + "num_tokens": 761332285.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6732053756713867, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8863520622253418, + "num_tokens": 761366686.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.696346402168274, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8558734655380249, + "num_tokens": 761404664.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.763536810874939, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8545767664909363, + "num_tokens": 761441595.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5753474235534668, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8836303949356079, + "num_tokens": 761480730.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7695162296295166, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8459818959236145, + "num_tokens": 761517963.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7772427797317505, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.865079402923584, + "num_tokens": 761555014.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.71442449092865, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8759623765945435, + "num_tokens": 761594131.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7860655784606934, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.870540976524353, + "num_tokens": 761630197.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.829164743423462, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8768816590309143, + "num_tokens": 761663044.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5799376964569092, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.867165207862854, + "num_tokens": 761707374.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8564302921295166, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8527833223342896, + "num_tokens": 761746308.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7173537015914917, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8716756105422974, + "num_tokens": 761783826.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8056107759475708, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8769815564155579, + "num_tokens": 761814801.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7975013256072998, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8724374771118164, + "num_tokens": 761848046.0, + "step": 19971 + }, + { + "epoch": 2.540643684009668, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7568024396896362, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8715260028839111, + "num_tokens": 761884005.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6269651651382446, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8773375749588013, + "num_tokens": 761919603.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6741143465042114, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8690251111984253, + "num_tokens": 761956963.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6665223836898804, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8770205974578857, + "num_tokens": 761993898.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.690956950187683, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8769017457962036, + "num_tokens": 762029836.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.646325707435608, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8649505376815796, + "num_tokens": 762069210.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.679144024848938, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8852623701095581, + "num_tokens": 762106110.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6294716596603394, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8821876645088196, + "num_tokens": 762143880.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5395331382751465, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8822264671325684, + "num_tokens": 762183264.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.513205885887146, + "learning_rate": 1e-06, + "loss": 0.3142, + "mean_token_accuracy": 0.897632360458374, + "num_tokens": 762222405.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5520976781845093, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8713665008544922, + "num_tokens": 762262021.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5998362302780151, + "learning_rate": 1e-06, + "loss": 0.3089, + "mean_token_accuracy": 0.8979780673980713, + "num_tokens": 762297119.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6543110609054565, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.862118124961853, + "num_tokens": 762338259.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6437897682189941, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8750967979431152, + "num_tokens": 762377355.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6152443885803223, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8823625445365906, + "num_tokens": 762412590.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6217169761657715, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8726820349693298, + "num_tokens": 762452357.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6142570972442627, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8881188631057739, + "num_tokens": 762486683.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7266526222229004, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8573394417762756, + "num_tokens": 762525990.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5897639989852905, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8675733804702759, + "num_tokens": 762568638.0, + "step": 19990 + }, + { + "epoch": 2.543060679302888, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6888115406036377, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8850108981132507, + "num_tokens": 762602503.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5360537767410278, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8757442235946655, + "num_tokens": 762645895.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7055524587631226, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8790419101715088, + "num_tokens": 762682324.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7230322360992432, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8641031980514526, + "num_tokens": 762720746.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8508061170578003, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8611668348312378, + "num_tokens": 762756689.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8512321710586548, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8678271770477295, + "num_tokens": 762788308.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7840641736984253, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8725404739379883, + "num_tokens": 762822519.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.646986961364746, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.886544942855835, + "num_tokens": 762863048.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6413602828979492, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8704437017440796, + "num_tokens": 762900599.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6779820919036865, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8699583411216736, + "num_tokens": 762939328.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4929136037826538, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8859381675720215, + "num_tokens": 762983680.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6077252626419067, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8844394087791443, + "num_tokens": 763021736.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7312902212142944, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8739736080169678, + "num_tokens": 763058555.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6877433061599731, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8801857829093933, + "num_tokens": 763090622.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5756642818450928, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8726404905319214, + "num_tokens": 763131046.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7039000988006592, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.8849074244499207, + "num_tokens": 763162926.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.585426688194275, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8842333555221558, + "num_tokens": 763198326.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.574668526649475, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8705555200576782, + "num_tokens": 763237144.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6646592617034912, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8820990324020386, + "num_tokens": 763272703.0, + "step": 20009 + }, + { + "epoch": 2.5454776745961074, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5979695320129395, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.866326630115509, + "num_tokens": 763313000.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6842546463012695, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8524783253669739, + "num_tokens": 763351951.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5608429908752441, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8838979601860046, + "num_tokens": 763389604.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6454263925552368, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8793153762817383, + "num_tokens": 763425996.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5947123765945435, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.877343475818634, + "num_tokens": 763463191.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6373590230941772, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.869449257850647, + "num_tokens": 763504066.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5987266302108765, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8567038178443909, + "num_tokens": 763543761.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6521941423416138, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8704187870025635, + "num_tokens": 763583937.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6664345264434814, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8888112902641296, + "num_tokens": 763619212.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6635574102401733, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8752057552337646, + "num_tokens": 763658975.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6717429161071777, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8685053586959839, + "num_tokens": 763697080.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5013593435287476, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8806741237640381, + "num_tokens": 763738077.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7232139110565186, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8749535083770752, + "num_tokens": 763773134.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6358627080917358, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8786472678184509, + "num_tokens": 763810882.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8007562160491943, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8716885447502136, + "num_tokens": 763847556.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7128547430038452, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8828061819076538, + "num_tokens": 763883301.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7764949798583984, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8631240129470825, + "num_tokens": 763919638.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6831735372543335, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8564632534980774, + "num_tokens": 763958439.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6517468690872192, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8733559250831604, + "num_tokens": 763998371.0, + "step": 20028 + }, + { + "epoch": 2.547894669889327, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.744608998298645, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.891609787940979, + "num_tokens": 764027726.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6637520790100098, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8858349323272705, + "num_tokens": 764064911.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6004618406295776, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8696943521499634, + "num_tokens": 764110126.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7298738956451416, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8621630668640137, + "num_tokens": 764148300.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.518698811531067, + "learning_rate": 1e-06, + "loss": 0.3335, + "mean_token_accuracy": 0.8899739384651184, + "num_tokens": 764187087.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.590390920639038, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8842655420303345, + "num_tokens": 764228265.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7186107635498047, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8759210705757141, + "num_tokens": 764263385.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.569243311882019, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8790557384490967, + "num_tokens": 764303385.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7375987768173218, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8673602342605591, + "num_tokens": 764337669.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.618969202041626, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8631455898284912, + "num_tokens": 764378248.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.678516149520874, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8666889667510986, + "num_tokens": 764416884.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6471288204193115, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8685622811317444, + "num_tokens": 764457715.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4793368577957153, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8742649555206299, + "num_tokens": 764504263.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5369476079940796, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.867348313331604, + "num_tokens": 764543392.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6788301467895508, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8768618106842041, + "num_tokens": 764581284.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.9487013816833496, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8546838760375977, + "num_tokens": 764614152.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.482068419456482, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8694893717765808, + "num_tokens": 764660961.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6104602813720703, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8811402320861816, + "num_tokens": 764699429.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6498222351074219, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8871141076087952, + "num_tokens": 764734087.0, + "step": 20047 + }, + { + "epoch": 2.5503116651825466, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8449774980545044, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8656109571456909, + "num_tokens": 764765975.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4313849210739136, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8839954137802124, + "num_tokens": 764809259.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.528096318244934, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8658514022827148, + "num_tokens": 764852999.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.692131757736206, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8672012686729431, + "num_tokens": 764892276.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5262699127197266, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8720657825469971, + "num_tokens": 764935386.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6739654541015625, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8797980546951294, + "num_tokens": 764969223.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4549063444137573, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8805779218673706, + "num_tokens": 765014734.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5850186347961426, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8769882917404175, + "num_tokens": 765053950.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6982725858688354, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8803387880325317, + "num_tokens": 765092907.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6924813985824585, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8542652130126953, + "num_tokens": 765136343.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7304517030715942, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8914103507995605, + "num_tokens": 765172283.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6523659229278564, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8855999708175659, + "num_tokens": 765212191.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7054246664047241, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8602287769317627, + "num_tokens": 765250938.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7853219509124756, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8658024072647095, + "num_tokens": 765287370.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.620225429534912, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8635188341140747, + "num_tokens": 765329689.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6526587009429932, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8769733905792236, + "num_tokens": 765365054.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.806138277053833, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8767287731170654, + "num_tokens": 765394991.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5368568897247314, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8774594068527222, + "num_tokens": 765433854.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.700709342956543, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8731008172035217, + "num_tokens": 765469572.0, + "step": 20066 + }, + { + "epoch": 2.5527286604757666, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6584452390670776, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8700141310691833, + "num_tokens": 765508587.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6478626728057861, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8723899126052856, + "num_tokens": 765548313.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.46632981300354, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.887701153755188, + "num_tokens": 765591390.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5937856435775757, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.87566077709198, + "num_tokens": 765633012.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.526511549949646, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8752259612083435, + "num_tokens": 765675174.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.82542085647583, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8711728453636169, + "num_tokens": 765706745.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7282158136367798, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8582351207733154, + "num_tokens": 765741968.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6154839992523193, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8765425086021423, + "num_tokens": 765779511.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6610548496246338, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8651256561279297, + "num_tokens": 765817936.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6791808605194092, + "learning_rate": 1e-06, + "loss": 0.3312, + "mean_token_accuracy": 0.8919038772583008, + "num_tokens": 765855576.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7223020792007446, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.861980676651001, + "num_tokens": 765893714.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6388568878173828, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8829126954078674, + "num_tokens": 765932334.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6654764413833618, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8665466904640198, + "num_tokens": 765968692.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5316306352615356, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8833394050598145, + "num_tokens": 766013308.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6283738613128662, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8819617629051208, + "num_tokens": 766051037.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.590212345123291, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8773932456970215, + "num_tokens": 766098821.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.679610252380371, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8655484914779663, + "num_tokens": 766142032.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5648826360702515, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8718010187149048, + "num_tokens": 766182627.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6793900728225708, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8705816864967346, + "num_tokens": 766220831.0, + "step": 20085 + }, + { + "epoch": 2.5551456557689862, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7788804769515991, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8788840770721436, + "num_tokens": 766254520.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6898281574249268, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8808227777481079, + "num_tokens": 766292668.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6583983898162842, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8757568597793579, + "num_tokens": 766332601.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6315560340881348, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8705452084541321, + "num_tokens": 766370909.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6049870252609253, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8790537118911743, + "num_tokens": 766409025.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6871020793914795, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8819372653961182, + "num_tokens": 766445664.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6244573593139648, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8900575637817383, + "num_tokens": 766482612.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.626668930053711, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8811701536178589, + "num_tokens": 766519500.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7620596885681152, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8763357400894165, + "num_tokens": 766551056.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5827975273132324, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8808071613311768, + "num_tokens": 766591017.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.669521450996399, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8792420625686646, + "num_tokens": 766629093.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.601252794265747, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8686911463737488, + "num_tokens": 766669022.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5500444173812866, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8761916160583496, + "num_tokens": 766709269.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7009004354476929, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8848775625228882, + "num_tokens": 766748081.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7534562349319458, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8630950450897217, + "num_tokens": 766782285.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.560408592224121, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8852155208587646, + "num_tokens": 766820389.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7121778726577759, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8742424249649048, + "num_tokens": 766857391.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.516245722770691, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8684571981430054, + "num_tokens": 766902301.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5562002658843994, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8682392239570618, + "num_tokens": 766945729.0, + "step": 20104 + }, + { + "epoch": 2.557562651062206, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.733511209487915, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8676694631576538, + "num_tokens": 766980761.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5668797492980957, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8818384408950806, + "num_tokens": 767018398.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4594429731369019, + "learning_rate": 1e-06, + "loss": 0.3262, + "mean_token_accuracy": 0.8926947116851807, + "num_tokens": 767062614.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5550048351287842, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8734264373779297, + "num_tokens": 767102404.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6215260028839111, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8749984502792358, + "num_tokens": 767143259.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6673444509506226, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8771212697029114, + "num_tokens": 767180179.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6369352340698242, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8729119300842285, + "num_tokens": 767216944.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5604221820831299, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8788918256759644, + "num_tokens": 767256226.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.648710012435913, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8682001829147339, + "num_tokens": 767299386.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4883850812911987, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8807085156440735, + "num_tokens": 767342539.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6130950450897217, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8763591051101685, + "num_tokens": 767381148.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.684950590133667, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8835486173629761, + "num_tokens": 767416537.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6809641122817993, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8675485253334045, + "num_tokens": 767453037.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7760952711105347, + "learning_rate": 1e-06, + "loss": 0.3359, + "mean_token_accuracy": 0.8924883604049683, + "num_tokens": 767482621.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6773544549942017, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8760993480682373, + "num_tokens": 767521365.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.739372968673706, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8687555193901062, + "num_tokens": 767557475.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.631701111793518, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8775240778923035, + "num_tokens": 767598039.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6625932455062866, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8794827461242676, + "num_tokens": 767635414.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7115473747253418, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8539854288101196, + "num_tokens": 767673792.0, + "step": 20123 + }, + { + "epoch": 2.5599796463554254, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.606527328491211, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.877153754234314, + "num_tokens": 767713909.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7546688318252563, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8788546919822693, + "num_tokens": 767748447.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7584370374679565, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.87166428565979, + "num_tokens": 767787424.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.815065622329712, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8885329961776733, + "num_tokens": 767817688.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6034163236618042, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8671013712882996, + "num_tokens": 767865298.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7103245258331299, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8713735342025757, + "num_tokens": 767900838.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8477678298950195, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.872546911239624, + "num_tokens": 767933341.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7281529903411865, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8794803619384766, + "num_tokens": 767969184.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.789836049079895, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8795300722122192, + "num_tokens": 768006976.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6947388648986816, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8620190024375916, + "num_tokens": 768045992.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6020442247390747, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8880771994590759, + "num_tokens": 768080184.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6979702711105347, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.877689778804779, + "num_tokens": 768114190.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5505125522613525, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8748166561126709, + "num_tokens": 768155033.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8304373025894165, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.858357846736908, + "num_tokens": 768187610.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6454026699066162, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8778561353683472, + "num_tokens": 768226898.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6404887437820435, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8630749583244324, + "num_tokens": 768263990.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8221324682235718, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8487780690193176, + "num_tokens": 768297201.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.486849308013916, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8804042339324951, + "num_tokens": 768341508.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7687212228775024, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8756565451622009, + "num_tokens": 768373580.0, + "step": 20142 + }, + { + "epoch": 2.562396641648645, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5823776721954346, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8797253966331482, + "num_tokens": 768414567.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6307542324066162, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8730196356773376, + "num_tokens": 768454871.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7048696279525757, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8665310144424438, + "num_tokens": 768492539.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.6857283115386963, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.864518940448761, + "num_tokens": 768529270.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.726102590560913, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8607621788978577, + "num_tokens": 768572520.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6662036180496216, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.876766562461853, + "num_tokens": 768612366.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5913724899291992, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8689894080162048, + "num_tokens": 768653300.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6484311819076538, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.861858606338501, + "num_tokens": 768692244.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.477761149406433, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8713762760162354, + "num_tokens": 768735126.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5699670314788818, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8825390338897705, + "num_tokens": 768771230.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.559422492980957, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8873168230056763, + "num_tokens": 768810324.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6243962049484253, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8812884092330933, + "num_tokens": 768845772.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.728078842163086, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8688980340957642, + "num_tokens": 768882667.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6025497913360596, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8808140754699707, + "num_tokens": 768915887.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6740870475769043, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8799304962158203, + "num_tokens": 768954614.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7407515048980713, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.884688138961792, + "num_tokens": 768989679.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.734750747680664, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.861966609954834, + "num_tokens": 769025335.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5511313676834106, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8791698217391968, + "num_tokens": 769063962.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.577492594718933, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8692985773086548, + "num_tokens": 769105204.0, + "step": 20161 + }, + { + "epoch": 2.564813636941865, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7144851684570312, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8712096214294434, + "num_tokens": 769140697.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6168608665466309, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.867424488067627, + "num_tokens": 769181743.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.871779203414917, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.86961430311203, + "num_tokens": 769214954.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7106287479400635, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8794666528701782, + "num_tokens": 769250169.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5987446308135986, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8804802894592285, + "num_tokens": 769287451.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6042964458465576, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8618175387382507, + "num_tokens": 769325227.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4323840141296387, + "learning_rate": 1e-06, + "loss": 0.3488, + "mean_token_accuracy": 0.8860588073730469, + "num_tokens": 769367145.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.586594820022583, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8800979852676392, + "num_tokens": 769403956.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6681228876113892, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8764520883560181, + "num_tokens": 769442604.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6519172191619873, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8887114524841309, + "num_tokens": 769474717.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6461963653564453, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8820115923881531, + "num_tokens": 769507880.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5741841793060303, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8736924529075623, + "num_tokens": 769547292.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7310038805007935, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8711646795272827, + "num_tokens": 769585500.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6877626180648804, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8853363990783691, + "num_tokens": 769620016.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.583181619644165, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8830696940422058, + "num_tokens": 769659628.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.609286904335022, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8777245283126831, + "num_tokens": 769699779.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6945111751556396, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8774557709693909, + "num_tokens": 769736301.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5018527507781982, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8616687059402466, + "num_tokens": 769781273.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7459598779678345, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8663231730461121, + "num_tokens": 769815051.0, + "step": 20180 + }, + { + "epoch": 2.5672306322350846, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6306976079940796, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8743020296096802, + "num_tokens": 769848917.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6926798820495605, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8710954189300537, + "num_tokens": 769886851.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5195566415786743, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.881633996963501, + "num_tokens": 769931457.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5480883121490479, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8766252994537354, + "num_tokens": 769969445.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7285140752792358, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8742482662200928, + "num_tokens": 770012673.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.668861985206604, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8710357546806335, + "num_tokens": 770053691.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6618539094924927, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8773042559623718, + "num_tokens": 770091461.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.714867115020752, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.886682391166687, + "num_tokens": 770131864.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7004005908966064, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8710538744926453, + "num_tokens": 770168253.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6256905794143677, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8866360783576965, + "num_tokens": 770203174.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4765182733535767, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.877976655960083, + "num_tokens": 770248108.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7497972249984741, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8713211417198181, + "num_tokens": 770282823.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7515982389450073, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8761674165725708, + "num_tokens": 770316655.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7246042490005493, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8723661303520203, + "num_tokens": 770352797.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.715451717376709, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.873390257358551, + "num_tokens": 770385578.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6600550413131714, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8865402340888977, + "num_tokens": 770422318.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6626170873641968, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8748038411140442, + "num_tokens": 770458176.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6697250604629517, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8881180882453918, + "num_tokens": 770489982.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5668922662734985, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8822219371795654, + "num_tokens": 770529113.0, + "step": 20199 + }, + { + "epoch": 2.5696476275283042, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6523517370224, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8792849779129028, + "num_tokens": 770568458.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.799378752708435, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8699595928192139, + "num_tokens": 770608195.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.615680456161499, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8750627040863037, + "num_tokens": 770644513.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7255606651306152, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.875051736831665, + "num_tokens": 770680105.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6689280271530151, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8798240423202515, + "num_tokens": 770717752.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6288154125213623, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8856732845306396, + "num_tokens": 770756999.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5373483896255493, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8785229921340942, + "num_tokens": 770802534.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7776100635528564, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8603149652481079, + "num_tokens": 770838280.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8318027257919312, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8664745688438416, + "num_tokens": 770871516.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.655053734779358, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8778328895568848, + "num_tokens": 770907799.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6890228986740112, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8706158995628357, + "num_tokens": 770946514.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.603931188583374, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8805583715438843, + "num_tokens": 770984568.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7665132284164429, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8717207312583923, + "num_tokens": 771017917.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5491222143173218, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8825833201408386, + "num_tokens": 771061388.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5873297452926636, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8682641983032227, + "num_tokens": 771103146.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.6555397510528564, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8760796785354614, + "num_tokens": 771138815.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5638066530227661, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8716248273849487, + "num_tokens": 771179752.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.870523452758789, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8655709028244019, + "num_tokens": 771212877.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.626281499862671, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8551234006881714, + "num_tokens": 771256429.0, + "step": 20218 + }, + { + "epoch": 2.572064622821524, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5243244171142578, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8747151494026184, + "num_tokens": 771300717.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6446441411972046, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8657839298248291, + "num_tokens": 771336901.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6546381711959839, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8745635151863098, + "num_tokens": 771376322.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7399059534072876, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8683860898017883, + "num_tokens": 771415188.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6726224422454834, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8661797642707825, + "num_tokens": 771455730.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5802559852600098, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8815819025039673, + "num_tokens": 771494189.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4996938705444336, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8768028020858765, + "num_tokens": 771537306.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.66140878200531, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.8872576355934143, + "num_tokens": 771571099.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6472055912017822, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8683369755744934, + "num_tokens": 771611565.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5714865922927856, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8624517917633057, + "num_tokens": 771655753.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6083465814590454, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8667855858802795, + "num_tokens": 771699034.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.703773856163025, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.857660174369812, + "num_tokens": 771739292.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.618269443511963, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.881869375705719, + "num_tokens": 771778911.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5292598009109497, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8854461908340454, + "num_tokens": 771816359.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.582161784172058, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8735793828964233, + "num_tokens": 771857477.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5180994272232056, + "learning_rate": 1e-06, + "loss": 0.32, + "mean_token_accuracy": 0.8954423069953918, + "num_tokens": 771894162.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.619403600692749, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8647783994674683, + "num_tokens": 771932371.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5696439743041992, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8906756043434143, + "num_tokens": 771970483.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6820849180221558, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8861257433891296, + "num_tokens": 772004184.0, + "step": 20237 + }, + { + "epoch": 2.574481618114744, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.676491141319275, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.868306577205658, + "num_tokens": 772041515.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7741198539733887, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8623945713043213, + "num_tokens": 772079975.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6254022121429443, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.877197802066803, + "num_tokens": 772120135.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5830533504486084, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8778077363967896, + "num_tokens": 772160746.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5649986267089844, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8803004622459412, + "num_tokens": 772200100.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6633296012878418, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8750678300857544, + "num_tokens": 772238390.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4847698211669922, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8724939823150635, + "num_tokens": 772282955.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5455518960952759, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8869500160217285, + "num_tokens": 772323760.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6390767097473145, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8710535168647766, + "num_tokens": 772363248.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.652130365371704, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8849388360977173, + "num_tokens": 772401481.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6856144666671753, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8729434609413147, + "num_tokens": 772439742.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7119311094284058, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8739588856697083, + "num_tokens": 772473531.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6600620746612549, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8741073608398438, + "num_tokens": 772512480.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6662719249725342, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8772319555282593, + "num_tokens": 772551513.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5645339488983154, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8831751346588135, + "num_tokens": 772590934.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 2.275005578994751, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8807662725448608, + "num_tokens": 772626037.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5768318176269531, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8820067644119263, + "num_tokens": 772666940.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7428056001663208, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.882946252822876, + "num_tokens": 772698794.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5076335668563843, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8805426359176636, + "num_tokens": 772741915.0, + "step": 20256 + }, + { + "epoch": 2.5768986134079634, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.673439621925354, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8757470846176147, + "num_tokens": 772774657.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6296366453170776, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8643872737884521, + "num_tokens": 772814130.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6959360837936401, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.859383225440979, + "num_tokens": 772856788.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.665173888206482, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8784998655319214, + "num_tokens": 772893699.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.751865029335022, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8534034490585327, + "num_tokens": 772929970.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7291392087936401, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8769826889038086, + "num_tokens": 772968264.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7894740104675293, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8622756600379944, + "num_tokens": 773001211.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6987384557724, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8634216785430908, + "num_tokens": 773038124.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8592218160629272, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8689025640487671, + "num_tokens": 773072324.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.697737216949463, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8648970723152161, + "num_tokens": 773105415.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5839303731918335, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8641740083694458, + "num_tokens": 773148202.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6662015914916992, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8696314692497253, + "num_tokens": 773183496.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7105025053024292, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8634069561958313, + "num_tokens": 773224559.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 2.256622314453125, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8740646243095398, + "num_tokens": 773263092.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5383853912353516, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8642637729644775, + "num_tokens": 773306865.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.579600214958191, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8665988445281982, + "num_tokens": 773351703.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6675342321395874, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8755331039428711, + "num_tokens": 773390047.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4718881845474243, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8771945238113403, + "num_tokens": 773432722.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5613218545913696, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8713769316673279, + "num_tokens": 773472611.0, + "step": 20275 + }, + { + "epoch": 2.579315608701183, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.731680989265442, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.870760977268219, + "num_tokens": 773511884.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7452818155288696, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8633934259414673, + "num_tokens": 773547318.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.687178373336792, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8677961826324463, + "num_tokens": 773584052.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7139662504196167, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8692494630813599, + "num_tokens": 773617936.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.594535231590271, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8630837798118591, + "num_tokens": 773661214.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5058387517929077, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8805252313613892, + "num_tokens": 773702390.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6713933944702148, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8803660869598389, + "num_tokens": 773738417.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5394256114959717, + "learning_rate": 1e-06, + "loss": 0.338, + "mean_token_accuracy": 0.8907454013824463, + "num_tokens": 773774579.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7095459699630737, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8779438734054565, + "num_tokens": 773809961.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6178778409957886, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8659927845001221, + "num_tokens": 773852714.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.599554419517517, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8802942037582397, + "num_tokens": 773887201.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7829703092575073, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8829147815704346, + "num_tokens": 773917707.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6805084943771362, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8671918511390686, + "num_tokens": 773954273.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5943777561187744, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.88387531042099, + "num_tokens": 773997916.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7183655500411987, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8712155222892761, + "num_tokens": 774035128.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7122467756271362, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8589576482772827, + "num_tokens": 774071951.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7507500648498535, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8723939657211304, + "num_tokens": 774102947.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5918813943862915, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.882725715637207, + "num_tokens": 774141929.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.658542513847351, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8686495423316956, + "num_tokens": 774181342.0, + "step": 20294 + }, + { + "epoch": 2.5817326039944026, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.62692391872406, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8722963333129883, + "num_tokens": 774217342.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6129534244537354, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8873237371444702, + "num_tokens": 774254683.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4990918636322021, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8689192533493042, + "num_tokens": 774302116.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6584060192108154, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8779839277267456, + "num_tokens": 774339321.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6758095026016235, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8857092261314392, + "num_tokens": 774374677.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6688028573989868, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8615310192108154, + "num_tokens": 774417729.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.9202240705490112, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8757232427597046, + "num_tokens": 774449375.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7209259271621704, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8810300827026367, + "num_tokens": 774481533.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.692902684211731, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8777983784675598, + "num_tokens": 774520526.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7090919017791748, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8744038343429565, + "num_tokens": 774559594.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 2.13914155960083, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8866397738456726, + "num_tokens": 774591278.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5359697341918945, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8656455278396606, + "num_tokens": 774639477.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.738865613937378, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8722870349884033, + "num_tokens": 774673235.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.65084707736969, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8629967570304871, + "num_tokens": 774713538.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7584952116012573, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.874845027923584, + "num_tokens": 774750603.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5155606269836426, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8822672963142395, + "num_tokens": 774791812.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.54335618019104, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8783276677131653, + "num_tokens": 774833452.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.572928547859192, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8793347477912903, + "num_tokens": 774875126.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6406912803649902, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.873801589012146, + "num_tokens": 774913240.0, + "step": 20313 + }, + { + "epoch": 2.5841495992876222, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5888725519180298, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8912523984909058, + "num_tokens": 774946404.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5553147792816162, + "learning_rate": 1e-06, + "loss": 0.3314, + "mean_token_accuracy": 0.8915460109710693, + "num_tokens": 774983613.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.616366982460022, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8852176666259766, + "num_tokens": 775019242.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.707074522972107, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8811156153678894, + "num_tokens": 775050610.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6543277502059937, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8706507682800293, + "num_tokens": 775087880.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.744814395904541, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8631741404533386, + "num_tokens": 775126912.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.673803448677063, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8817228674888611, + "num_tokens": 775160157.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.563335657119751, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8838523030281067, + "num_tokens": 775198693.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.662580132484436, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8834826946258545, + "num_tokens": 775234717.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.6804440021514893, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8858339190483093, + "num_tokens": 775271455.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.7618544101715088, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.870823860168457, + "num_tokens": 775308476.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.76557457447052, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.876424252986908, + "num_tokens": 775346618.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 3.5924930572509766, + "learning_rate": 1e-06, + "loss": 0.3249, + "mean_token_accuracy": 0.8956403136253357, + "num_tokens": 775386523.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.7081044912338257, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8766745328903198, + "num_tokens": 775425724.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7247264385223389, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8745720386505127, + "num_tokens": 775462811.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5671792030334473, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8779246807098389, + "num_tokens": 775504966.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7088981866836548, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8638347387313843, + "num_tokens": 775539709.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.8696860074996948, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8845349550247192, + "num_tokens": 775572710.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.731824278831482, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8829691410064697, + "num_tokens": 775607663.0, + "step": 20332 + }, + { + "epoch": 2.5865665945808423, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.6900368928909302, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8698683977127075, + "num_tokens": 775644656.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5991250276565552, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8678824305534363, + "num_tokens": 775684173.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5692375898361206, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8870290517807007, + "num_tokens": 775727151.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.8337258100509644, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8713904023170471, + "num_tokens": 775759485.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6851227283477783, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8715509176254272, + "num_tokens": 775799013.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.763110876083374, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8870605230331421, + "num_tokens": 775833132.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.696839690208435, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8677850365638733, + "num_tokens": 775870627.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5744872093200684, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8843015432357788, + "num_tokens": 775905684.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.716423511505127, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8666258454322815, + "num_tokens": 775946044.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5517945289611816, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8804838061332703, + "num_tokens": 775984862.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.718421459197998, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.881997287273407, + "num_tokens": 776019147.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.591398000717163, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8792123198509216, + "num_tokens": 776058872.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.613741159439087, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8796373605728149, + "num_tokens": 776097453.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6743382215499878, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8721812963485718, + "num_tokens": 776134494.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.7537498474121094, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8688744306564331, + "num_tokens": 776172987.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.7149468660354614, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8792362213134766, + "num_tokens": 776208613.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.9294921159744263, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8623642921447754, + "num_tokens": 776245709.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7110466957092285, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8861557245254517, + "num_tokens": 776281043.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5556129217147827, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.873546838760376, + "num_tokens": 776323232.0, + "step": 20351 + }, + { + "epoch": 2.588983589874062, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5554394721984863, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8877884149551392, + "num_tokens": 776363885.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6200913190841675, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8715333342552185, + "num_tokens": 776401406.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.5684983730316162, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8806923627853394, + "num_tokens": 776443330.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.717991828918457, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8719627857208252, + "num_tokens": 776480290.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7032629251480103, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8771580457687378, + "num_tokens": 776513967.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6780550479888916, + "learning_rate": 1e-06, + "loss": 0.3415, + "mean_token_accuracy": 0.8879028558731079, + "num_tokens": 776553573.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6595184803009033, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8748331069946289, + "num_tokens": 776589702.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5549226999282837, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8816333413124084, + "num_tokens": 776631284.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7315959930419922, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8656080365180969, + "num_tokens": 776663077.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.677505612373352, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8725450038909912, + "num_tokens": 776700748.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7598131895065308, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.885967493057251, + "num_tokens": 776730168.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5263949632644653, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8720637559890747, + "num_tokens": 776773546.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6499236822128296, + "learning_rate": 1e-06, + "loss": 0.3483, + "mean_token_accuracy": 0.8857121467590332, + "num_tokens": 776809726.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 2.872943878173828e-05, + "grad_norm": 1.7567758560180664, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8703073263168335, + "num_tokens": 776846097.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.743910312652588, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8608112335205078, + "num_tokens": 776885421.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6287535429000854, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8710057735443115, + "num_tokens": 776929810.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.80716073513031, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8749991655349731, + "num_tokens": 776961574.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5392632484436035, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8799291253089905, + "num_tokens": 777001617.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6614913940429688, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8611879944801331, + "num_tokens": 777043310.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6594241857528687, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8875887393951416, + "num_tokens": 777079079.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.707960605621338, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8657368421554565, + "num_tokens": 777115300.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5999819040298462, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8765667676925659, + "num_tokens": 777154353.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6219738721847534, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8813774585723877, + "num_tokens": 777189192.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7888926267623901, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8739495873451233, + "num_tokens": 777227687.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6872540712356567, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8594459295272827, + "num_tokens": 777266822.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7437243461608887, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8600220680236816, + "num_tokens": 777302678.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6391443014144897, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.87464839220047, + "num_tokens": 777338618.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6382430791854858, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8799168467521667, + "num_tokens": 777374303.0, + "step": 20379 + }, + { + "epoch": 2.592545477674596, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7183969020843506, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8694469928741455, + "num_tokens": 777411091.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7554314136505127, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8541730642318726, + "num_tokens": 777456542.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7243506908416748, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8845613598823547, + "num_tokens": 777493292.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6838247776031494, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8664473295211792, + "num_tokens": 777536366.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7340011596679688, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8918238878250122, + "num_tokens": 777567017.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.814016580581665, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8769662380218506, + "num_tokens": 777602118.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7064093351364136, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8802683353424072, + "num_tokens": 777634156.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6776210069656372, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.8851715326309204, + "num_tokens": 777672468.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6621136665344238, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8704018592834473, + "num_tokens": 777712790.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6706782579421997, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8713619709014893, + "num_tokens": 777751203.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5897566080093384, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8748962879180908, + "num_tokens": 777789948.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.70911705493927, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8687995672225952, + "num_tokens": 777825085.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.641988754272461, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8824339509010315, + "num_tokens": 777861879.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4940739870071411, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8817110061645508, + "num_tokens": 777903966.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7167266607284546, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8589620590209961, + "num_tokens": 777945849.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.568442702293396, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8836078643798828, + "num_tokens": 777982360.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5435014963150024, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8711615800857544, + "num_tokens": 778025920.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.687023639678955, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8551368713378906, + "num_tokens": 778067633.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6510893106460571, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8801304697990417, + "num_tokens": 778101268.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5783751010894775, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8750836849212646, + "num_tokens": 778142587.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6716333627700806, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8687117099761963, + "num_tokens": 778179399.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7218775749206543, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8714464902877808, + "num_tokens": 778215178.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.656867265701294, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8775904178619385, + "num_tokens": 778249688.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.496592402458191, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8812023401260376, + "num_tokens": 778293673.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7316993474960327, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.850824773311615, + "num_tokens": 778333297.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.653987169265747, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8900729417800903, + "num_tokens": 778368329.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5458319187164307, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8755244016647339, + "num_tokens": 778410328.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6420564651489258, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8660483956336975, + "num_tokens": 778451578.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5229766368865967, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.884803295135498, + "num_tokens": 778493845.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6893775463104248, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.877855658531189, + "num_tokens": 778528692.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5695924758911133, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8878921270370483, + "num_tokens": 778570841.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6831681728363037, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8715243339538574, + "num_tokens": 778612017.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8974367380142212, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8745774626731873, + "num_tokens": 778644811.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6740707159042358, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8766422867774963, + "num_tokens": 778680179.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7403417825698853, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8669456243515015, + "num_tokens": 778716711.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5599055290222168, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8844937086105347, + "num_tokens": 778758524.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7228963375091553, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8846116065979004, + "num_tokens": 778792896.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.682944416999817, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8780801296234131, + "num_tokens": 778833393.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7355223894119263, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8668121099472046, + "num_tokens": 778868297.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.62319016456604, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8843116760253906, + "num_tokens": 778903568.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.781466007232666, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8627957105636597, + "num_tokens": 778935514.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6165519952774048, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8705715537071228, + "num_tokens": 778975021.0, + "step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7128331661224365, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8592685461044312, + "num_tokens": 779013302.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8399091958999634, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8708476424217224, + "num_tokens": 779048963.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7954753637313843, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8714181184768677, + "num_tokens": 779079806.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.676334023475647, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8749778866767883, + "num_tokens": 779116664.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.8282307386398315, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.855579674243927, + "num_tokens": 779152276.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.625980019569397, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8764309883117676, + "num_tokens": 779189271.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.5787571668624878, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8669712543487549, + "num_tokens": 779232231.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.4749269485473633, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8821241855621338, + "num_tokens": 779279376.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.666567325592041, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8842161893844604, + "num_tokens": 779317902.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6267658472061157, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8630324602127075, + "num_tokens": 779362393.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6188459396362305, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8799463510513306, + "num_tokens": 779401275.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.6556732654571533, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8795113563537598, + "num_tokens": 779437258.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.4309883117675781, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8871909379959106, + "num_tokens": 779483451.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 2.849102020263672e-05, + "grad_norm": 1.7204906940460205, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8878934383392334, + "num_tokens": 779518594.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6734683513641357, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8692810535430908, + "num_tokens": 779554995.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8757964372634888, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8697692155838013, + "num_tokens": 779586926.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7303208112716675, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8729002475738525, + "num_tokens": 779621993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6947014331817627, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.882078230381012, + "num_tokens": 779655328.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6800206899642944, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8673471212387085, + "num_tokens": 779692439.0, + "step": 20440 + }, + { + "epoch": 2.600305304668617, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.635010838508606, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8802754878997803, + "num_tokens": 779730571.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7436110973358154, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8759494423866272, + "num_tokens": 779763371.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5614784955978394, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8682560920715332, + "num_tokens": 779803095.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5565890073776245, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.879509449005127, + "num_tokens": 779842597.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.664436936378479, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8714627027511597, + "num_tokens": 779884559.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6636955738067627, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8832526206970215, + "num_tokens": 779920020.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6842695474624634, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8681204319000244, + "num_tokens": 779956146.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7143323421478271, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.85529625415802, + "num_tokens": 779994343.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7487549781799316, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8780726194381714, + "num_tokens": 780027552.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6836539506912231, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.878146767616272, + "num_tokens": 780063011.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5275686979293823, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8831914663314819, + "num_tokens": 780106573.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6161599159240723, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8724558353424072, + "num_tokens": 780148119.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6310197114944458, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.876945436000824, + "num_tokens": 780188921.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6144288778305054, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8683854341506958, + "num_tokens": 780229772.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6251070499420166, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8824114203453064, + "num_tokens": 780267544.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.548205852508545, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8801977634429932, + "num_tokens": 780307748.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6509490013122559, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8586252331733704, + "num_tokens": 780344621.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5387259721755981, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8851690888404846, + "num_tokens": 780386542.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5028518438339233, + "learning_rate": 1e-06, + "loss": 0.3363, + "mean_token_accuracy": 0.889336884021759, + "num_tokens": 780429388.0, + "step": 20459 + }, + { + "epoch": 2.6027222999618367, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6598037481307983, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.88490891456604, + "num_tokens": 780468854.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.887014389038086, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8601433634757996, + "num_tokens": 780502033.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.644596815109253, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8771892786026001, + "num_tokens": 780540711.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.568217158317566, + "learning_rate": 1e-06, + "loss": 0.319, + "mean_token_accuracy": 0.8960031867027283, + "num_tokens": 780578459.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8548965454101562, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8743726015090942, + "num_tokens": 780612446.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7552003860473633, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.862305760383606, + "num_tokens": 780649050.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6713597774505615, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8809278011322021, + "num_tokens": 780685936.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5229605436325073, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8745723962783813, + "num_tokens": 780728827.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6295075416564941, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.886551558971405, + "num_tokens": 780764181.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5132540464401245, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8713692426681519, + "num_tokens": 780809408.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7330119609832764, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8812140226364136, + "num_tokens": 780849158.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6521971225738525, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8773521184921265, + "num_tokens": 780889636.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7247380018234253, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8747901916503906, + "num_tokens": 780926112.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7441924810409546, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8700608015060425, + "num_tokens": 780961622.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6741948127746582, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8557065725326538, + "num_tokens": 781003574.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.9015727043151855, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8778303861618042, + "num_tokens": 781036027.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7478300333023071, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8697130680084229, + "num_tokens": 781076360.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6408699750900269, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8758723139762878, + "num_tokens": 781116953.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.4980498552322388, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8877039551734924, + "num_tokens": 781159239.0, + "step": 20478 + }, + { + "epoch": 2.6051392952550567, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6443349123001099, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8752357959747314, + "num_tokens": 781200049.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5914905071258545, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.88158118724823, + "num_tokens": 781242687.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7225584983825684, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.871985912322998, + "num_tokens": 781277761.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.8092024326324463, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8655949234962463, + "num_tokens": 781315606.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 2.3007006645202637, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.880347728729248, + "num_tokens": 781348852.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.561282753944397, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8799810409545898, + "num_tokens": 781385353.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7218912839889526, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8787842988967896, + "num_tokens": 781420047.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7027411460876465, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.887377142906189, + "num_tokens": 781457978.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6036255359649658, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8701657652854919, + "num_tokens": 781498330.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6618976593017578, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8835734724998474, + "num_tokens": 781533666.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5234516859054565, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8842926025390625, + "num_tokens": 781572387.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.627631425857544, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8686571717262268, + "num_tokens": 781612453.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.675284743309021, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8535009026527405, + "num_tokens": 781654328.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5480824708938599, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8737893104553223, + "num_tokens": 781698265.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.9594889879226685, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.847592294216156, + "num_tokens": 781739918.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7388859987258911, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8747414946556091, + "num_tokens": 781775632.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6611018180847168, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8848766088485718, + "num_tokens": 781814026.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6385276317596436, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8761870265007019, + "num_tokens": 781853207.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.679334282875061, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8751085996627808, + "num_tokens": 781888262.0, + "step": 20497 + }, + { + "epoch": 2.6075562905482763, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.4697526693344116, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8852922916412354, + "num_tokens": 781933220.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7434946298599243, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8750379681587219, + "num_tokens": 781966091.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5197362899780273, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8824083805084229, + "num_tokens": 782005532.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6522235870361328, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8689907193183899, + "num_tokens": 782041158.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6417434215545654, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8776253461837769, + "num_tokens": 782076150.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6468888521194458, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.875596284866333, + "num_tokens": 782113327.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6853584051132202, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8667056560516357, + "num_tokens": 782151679.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.7197659015655518, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8625443577766418, + "num_tokens": 782191326.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.754738450050354, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8907679319381714, + "num_tokens": 782224130.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6764800548553467, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8752449750900269, + "num_tokens": 782260368.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5393635034561157, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8882391452789307, + "num_tokens": 782299088.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5858381986618042, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8829473257064819, + "num_tokens": 782339153.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6954199075698853, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8745101690292358, + "num_tokens": 782373823.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6119434833526611, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8708959817886353, + "num_tokens": 782412445.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 2.212463140487671, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8886537551879883, + "num_tokens": 782449884.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7352880239486694, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8590995073318481, + "num_tokens": 782486479.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6467924118041992, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8797220587730408, + "num_tokens": 782529254.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6787770986557007, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8817733526229858, + "num_tokens": 782563028.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.510679841041565, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8728621602058411, + "num_tokens": 782609819.0, + "step": 20516 + }, + { + "epoch": 2.609973285841496, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6341025829315186, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8805236220359802, + "num_tokens": 782648141.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6343770027160645, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8796241283416748, + "num_tokens": 782684697.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.611753225326538, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8747209310531616, + "num_tokens": 782723060.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6704071760177612, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8595392107963562, + "num_tokens": 782762236.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5353325605392456, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8789597749710083, + "num_tokens": 782802603.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5304327011108398, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8720863461494446, + "num_tokens": 782844697.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5598713159561157, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8749877214431763, + "num_tokens": 782885776.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7824857234954834, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8761218190193176, + "num_tokens": 782916838.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.801409125328064, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8615070581436157, + "num_tokens": 782951904.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6562360525131226, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.872961163520813, + "num_tokens": 782987064.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6074963808059692, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8623493909835815, + "num_tokens": 783029746.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.64444899559021, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8726828098297119, + "num_tokens": 783071123.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5716789960861206, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8787876963615417, + "num_tokens": 783112235.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6833187341690063, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8677603006362915, + "num_tokens": 783152285.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5439125299453735, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8784247040748596, + "num_tokens": 783190275.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6943410634994507, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8597337007522583, + "num_tokens": 783226794.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.740452766418457, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.870256245136261, + "num_tokens": 783261573.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7910761833190918, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8753131628036499, + "num_tokens": 783296402.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5841848850250244, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8768664598464966, + "num_tokens": 783339810.0, + "step": 20535 + }, + { + "epoch": 2.612390281134716, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7058751583099365, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8747716546058655, + "num_tokens": 783377772.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.596989393234253, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8733257055282593, + "num_tokens": 783422226.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.592664122581482, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8849159479141235, + "num_tokens": 783464659.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8488556146621704, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8701595067977905, + "num_tokens": 783497870.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6823841333389282, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8706568479537964, + "num_tokens": 783536635.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6812504529953003, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8525412082672119, + "num_tokens": 783577122.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6361967325210571, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8725858330726624, + "num_tokens": 783615942.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7647045850753784, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8674255609512329, + "num_tokens": 783651391.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.587589979171753, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.8915221095085144, + "num_tokens": 783689435.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5609502792358398, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8797479867935181, + "num_tokens": 783731111.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6221110820770264, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.870930016040802, + "num_tokens": 783770289.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7329171895980835, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8675673007965088, + "num_tokens": 783809005.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6615034341812134, + "learning_rate": 1e-06, + "loss": 0.3447, + "mean_token_accuracy": 0.8856215476989746, + "num_tokens": 783848332.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7395097017288208, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8823708295822144, + "num_tokens": 783884396.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7636277675628662, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8823378086090088, + "num_tokens": 783922061.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7341469526290894, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.877181887626648, + "num_tokens": 783959911.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5900778770446777, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8671371936798096, + "num_tokens": 783999789.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8391191959381104, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8650707006454468, + "num_tokens": 784034022.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.628354787826538, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.883712649345398, + "num_tokens": 784069936.0, + "step": 20554 + }, + { + "epoch": 2.6148072764279355, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7295830249786377, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8747431039810181, + "num_tokens": 784107494.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.735496163368225, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8935139179229736, + "num_tokens": 784140634.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6268818378448486, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8831080198287964, + "num_tokens": 784182248.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.4895061254501343, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8728775382041931, + "num_tokens": 784226822.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5160343647003174, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8783893585205078, + "num_tokens": 784269731.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.525458812713623, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8729852437973022, + "num_tokens": 784315732.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5426005125045776, + "learning_rate": 1e-06, + "loss": 0.3291, + "mean_token_accuracy": 0.892930805683136, + "num_tokens": 784349127.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7147762775421143, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.874298632144928, + "num_tokens": 784385473.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6976810693740845, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8749251961708069, + "num_tokens": 784424502.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7086291313171387, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8615518808364868, + "num_tokens": 784464884.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5154162645339966, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.880320131778717, + "num_tokens": 784507843.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5216056108474731, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8842397928237915, + "num_tokens": 784551745.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.639914631843567, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8765193223953247, + "num_tokens": 784592264.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.619073510169983, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8689540028572083, + "num_tokens": 784633848.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7261196374893188, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8860818147659302, + "num_tokens": 784666516.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.595152497291565, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8865845799446106, + "num_tokens": 784703606.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5607006549835205, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8827940821647644, + "num_tokens": 784741662.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5246765613555908, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8751067519187927, + "num_tokens": 784782722.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.5843586921691895, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8876888751983643, + "num_tokens": 784819590.0, + "step": 20573 + }, + { + "epoch": 2.617224271721155, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6072067022323608, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8664023876190186, + "num_tokens": 784857575.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.690981388092041, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8824379444122314, + "num_tokens": 784892558.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7228306531906128, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.871282160282135, + "num_tokens": 784931121.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5634772777557373, + "learning_rate": 1e-06, + "loss": 0.3299, + "mean_token_accuracy": 0.89258873462677, + "num_tokens": 784969048.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7002571821212769, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8638127446174622, + "num_tokens": 785010071.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7742557525634766, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8701765537261963, + "num_tokens": 785046919.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7786301374435425, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8800199031829834, + "num_tokens": 785079773.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7084228992462158, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8777279853820801, + "num_tokens": 785117078.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.591323971748352, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.865447998046875, + "num_tokens": 785157597.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7152959108352661, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8722975850105286, + "num_tokens": 785189018.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6691707372665405, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8875728845596313, + "num_tokens": 785224965.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5259093046188354, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8844422698020935, + "num_tokens": 785266101.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5773357152938843, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8853490948677063, + "num_tokens": 785303141.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.676450490951538, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.886777400970459, + "num_tokens": 785340361.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6085281372070312, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.883959949016571, + "num_tokens": 785375872.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.612734079360962, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8840165734291077, + "num_tokens": 785412335.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6003837585449219, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8819260001182556, + "num_tokens": 785450777.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 2.86102294921875e-05, + "grad_norm": 1.6106613874435425, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.859970211982727, + "num_tokens": 785494065.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5581789016723633, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.871968150138855, + "num_tokens": 785538235.0, + "step": 20592 + }, + { + "epoch": 2.6196412670143747, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6272900104522705, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8638917207717896, + "num_tokens": 785576597.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.538988471031189, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8835923671722412, + "num_tokens": 785619349.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.797859787940979, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8620308637619019, + "num_tokens": 785654714.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.469795823097229, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8820702433586121, + "num_tokens": 785701504.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7443710565567017, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8735553622245789, + "num_tokens": 785735380.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6728652715682983, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.875414252281189, + "num_tokens": 785770947.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.492517352104187, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8842490315437317, + "num_tokens": 785812100.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.800292730331421, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8801873922348022, + "num_tokens": 785845356.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5684597492218018, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.872977614402771, + "num_tokens": 785886626.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5413141250610352, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8698530793190002, + "num_tokens": 785929918.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5885432958602905, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8759140968322754, + "num_tokens": 785969407.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5418832302093506, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8903892636299133, + "num_tokens": 786008059.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.676271677017212, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8756395578384399, + "num_tokens": 786048148.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6634365320205688, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8755228519439697, + "num_tokens": 786086080.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6035521030426025, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8722244501113892, + "num_tokens": 786121622.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5998910665512085, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8817547559738159, + "num_tokens": 786158281.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6154826879501343, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8645731210708618, + "num_tokens": 786200310.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.765384316444397, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8707278966903687, + "num_tokens": 786234340.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7276785373687744, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8664532899856567, + "num_tokens": 786271972.0, + "step": 20611 + }, + { + "epoch": 2.6220582623075943, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5379562377929688, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8838636875152588, + "num_tokens": 786312763.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6641765832901, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.874673068523407, + "num_tokens": 786351523.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.625925898551941, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8719614744186401, + "num_tokens": 786387234.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7460448741912842, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8691476583480835, + "num_tokens": 786421649.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5849156379699707, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8787186145782471, + "num_tokens": 786460023.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8571584224700928, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8820813298225403, + "num_tokens": 786490414.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5656921863555908, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.887428343296051, + "num_tokens": 786530734.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7834829092025757, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8596975803375244, + "num_tokens": 786569626.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5424634218215942, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8811969757080078, + "num_tokens": 786608871.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8010191917419434, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8749054670333862, + "num_tokens": 786642665.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.75786292552948, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8729215264320374, + "num_tokens": 786674786.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5266799926757812, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8853024244308472, + "num_tokens": 786717784.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5619844198226929, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.872501015663147, + "num_tokens": 786758655.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6329162120819092, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8801116347312927, + "num_tokens": 786793114.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.4969881772994995, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8787521123886108, + "num_tokens": 786838191.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6363242864608765, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8809388279914856, + "num_tokens": 786879629.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5421037673950195, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8809714317321777, + "num_tokens": 786922450.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.586027979850769, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8806133270263672, + "num_tokens": 786960943.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6148301362991333, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8587683439254761, + "num_tokens": 787000289.0, + "step": 20630 + }, + { + "epoch": 2.624475257600814, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7499903440475464, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8645471334457397, + "num_tokens": 787036535.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5947149991989136, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8803220987319946, + "num_tokens": 787073822.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6042946577072144, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8780301809310913, + "num_tokens": 787114739.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6822055578231812, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8673335313796997, + "num_tokens": 787150828.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.756185531616211, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8754538893699646, + "num_tokens": 787186024.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.571267008781433, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8739233613014221, + "num_tokens": 787224433.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.534536600112915, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8790906071662903, + "num_tokens": 787264642.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5259881019592285, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8742011785507202, + "num_tokens": 787308051.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6213396787643433, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8642559051513672, + "num_tokens": 787347998.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5538405179977417, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8752638101577759, + "num_tokens": 787389179.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6774145364761353, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8714084029197693, + "num_tokens": 787426472.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7061280012130737, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8793082237243652, + "num_tokens": 787460081.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5859577655792236, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8750669956207275, + "num_tokens": 787501088.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5558966398239136, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8766078352928162, + "num_tokens": 787540073.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5938202142715454, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8798558712005615, + "num_tokens": 787581658.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5772045850753784, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8649925589561462, + "num_tokens": 787624371.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8225936889648438, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8779705762863159, + "num_tokens": 787660782.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6050785779953003, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8826396465301514, + "num_tokens": 787701389.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.4927432537078857, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8727883696556091, + "num_tokens": 787745522.0, + "step": 20649 + }, + { + "epoch": 2.626892252894034, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5137256383895874, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8771564960479736, + "num_tokens": 787786743.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7045818567276, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8560725450515747, + "num_tokens": 787822748.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5864739418029785, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8806526064872742, + "num_tokens": 787862482.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6542304754257202, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8798556327819824, + "num_tokens": 787896819.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8105137348175049, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8654775023460388, + "num_tokens": 787932032.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6565771102905273, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8894597887992859, + "num_tokens": 787968085.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 16.821056365966797, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8878043293952942, + "num_tokens": 788005456.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6404657363891602, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8882118463516235, + "num_tokens": 788042641.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7111432552337646, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8780686259269714, + "num_tokens": 788079242.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.9154201745986938, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8697311878204346, + "num_tokens": 788111585.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6598700284957886, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8743199110031128, + "num_tokens": 788149460.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5519109964370728, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8890164494514465, + "num_tokens": 788189707.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7784873247146606, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8682006597518921, + "num_tokens": 788227811.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7831271886825562, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8689867854118347, + "num_tokens": 788263407.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8177216053009033, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.873297929763794, + "num_tokens": 788295903.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.6873213052749634, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8637568950653076, + "num_tokens": 788333923.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.652047038078308, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8677759766578674, + "num_tokens": 788371156.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6061028242111206, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8747121691703796, + "num_tokens": 788411698.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7658859491348267, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8762800693511963, + "num_tokens": 788451126.0, + "step": 20668 + }, + { + "epoch": 2.6293092481872535, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7509615421295166, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8729408979415894, + "num_tokens": 788483781.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.9686273336410522, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8665823936462402, + "num_tokens": 788517256.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7555967569351196, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8608031272888184, + "num_tokens": 788551142.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6448529958724976, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8657476305961609, + "num_tokens": 788588200.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7260743379592896, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8643091917037964, + "num_tokens": 788623158.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.8188133239746094, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8657327890396118, + "num_tokens": 788661715.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5700864791870117, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8671890497207642, + "num_tokens": 788703285.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5608989000320435, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8779222965240479, + "num_tokens": 788743803.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5778311491012573, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8785731792449951, + "num_tokens": 788782351.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6659233570098877, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8667047023773193, + "num_tokens": 788823389.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6746773719787598, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8689150214195251, + "num_tokens": 788860263.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.600420594215393, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8796389102935791, + "num_tokens": 788898898.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7043694257736206, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8764550089836121, + "num_tokens": 788935992.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.554417610168457, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.876773476600647, + "num_tokens": 788979259.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7360514402389526, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8704426288604736, + "num_tokens": 789017770.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5414406061172485, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.871048092842102, + "num_tokens": 789057120.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6361421346664429, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8855003118515015, + "num_tokens": 789093060.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7847472429275513, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8677579164505005, + "num_tokens": 789126585.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6741491556167603, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8807977437973022, + "num_tokens": 789165868.0, + "step": 20687 + }, + { + "epoch": 2.631726243480473, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.603249192237854, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8764495253562927, + "num_tokens": 789206082.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5746886730194092, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8870429992675781, + "num_tokens": 789242656.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6898412704467773, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8771032094955444, + "num_tokens": 789280138.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.592443585395813, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8852870464324951, + "num_tokens": 789322513.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.648763656616211, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8745235204696655, + "num_tokens": 789359992.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6009175777435303, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8734979629516602, + "num_tokens": 789400670.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.582832932472229, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8713903427124023, + "num_tokens": 789449027.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5025551319122314, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8887847661972046, + "num_tokens": 789490711.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7013437747955322, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8820051550865173, + "num_tokens": 789528083.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.578054666519165, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8836217522621155, + "num_tokens": 789570029.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7512197494506836, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8798823356628418, + "num_tokens": 789605576.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7616976499557495, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8728983402252197, + "num_tokens": 789640143.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5645557641983032, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8684085607528687, + "num_tokens": 789683672.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8141582012176514, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8747481107711792, + "num_tokens": 789716761.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.71368408203125, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8743887543678284, + "num_tokens": 789751959.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5973881483078003, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.869788408279419, + "num_tokens": 789792202.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6051932573318481, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8684023022651672, + "num_tokens": 789832747.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7604900598526, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8629688620567322, + "num_tokens": 789866363.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.7251909971237183, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8515836000442505, + "num_tokens": 789902502.0, + "step": 20706 + }, + { + "epoch": 2.634143238773693, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6765446662902832, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8707489967346191, + "num_tokens": 789939958.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7833523750305176, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8640707731246948, + "num_tokens": 789977198.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.510392665863037, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8829483985900879, + "num_tokens": 790019892.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5755221843719482, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8868664503097534, + "num_tokens": 790056344.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4952629804611206, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8878529071807861, + "num_tokens": 790096504.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7218917608261108, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8671598434448242, + "num_tokens": 790131669.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5708764791488647, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.893450140953064, + "num_tokens": 790167250.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5754481554031372, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8658837080001831, + "num_tokens": 790213100.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5499873161315918, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8807744383811951, + "num_tokens": 790253572.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.646907925605774, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.878973126411438, + "num_tokens": 790287640.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6612728834152222, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8650685548782349, + "num_tokens": 790324460.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.597589135169983, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8710939884185791, + "num_tokens": 790366020.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6303009986877441, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8676507472991943, + "num_tokens": 790405502.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6663190126419067, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8663209080696106, + "num_tokens": 790441206.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6786456108093262, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8724610209465027, + "num_tokens": 790477447.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7741973400115967, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8745056390762329, + "num_tokens": 790510448.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5658549070358276, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8835920691490173, + "num_tokens": 790548548.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6740052700042725, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8646308183670044, + "num_tokens": 790584453.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.793310523033142, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8672906160354614, + "num_tokens": 790622251.0, + "step": 20725 + }, + { + "epoch": 2.6365602340669128, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6582132577896118, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8818753361701965, + "num_tokens": 790657930.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7022762298583984, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.860812246799469, + "num_tokens": 790698289.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5656424760818481, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8746265769004822, + "num_tokens": 790742489.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5955452919006348, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8751678466796875, + "num_tokens": 790781506.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.75477135181427, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8727498054504395, + "num_tokens": 790817102.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6809879541397095, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8513889312744141, + "num_tokens": 790860092.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7501047849655151, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8715120553970337, + "num_tokens": 790896507.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6712499856948853, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8753378987312317, + "num_tokens": 790936238.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6187846660614014, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8709240555763245, + "num_tokens": 790977330.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7631652355194092, + "learning_rate": 1e-06, + "loss": 0.3395, + "mean_token_accuracy": 0.8892653584480286, + "num_tokens": 791007858.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6151472330093384, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8646138906478882, + "num_tokens": 791052074.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6271953582763672, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8815079927444458, + "num_tokens": 791093809.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7226977348327637, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8711692690849304, + "num_tokens": 791130610.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.732973337173462, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.870447039604187, + "num_tokens": 791167682.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.639427661895752, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8643098473548889, + "num_tokens": 791205329.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5587611198425293, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8797107338905334, + "num_tokens": 791248393.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5922725200653076, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8730546236038208, + "num_tokens": 791288862.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4674562215805054, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8848037123680115, + "num_tokens": 791330031.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7065058946609497, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8825026750564575, + "num_tokens": 791363478.0, + "step": 20744 + }, + { + "epoch": 2.6389772293601323, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6013506650924683, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8712694644927979, + "num_tokens": 791406483.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.549286961555481, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8809988498687744, + "num_tokens": 791449857.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6497856378555298, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8765286207199097, + "num_tokens": 791484948.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6093171834945679, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.871466338634491, + "num_tokens": 791525155.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6844114065170288, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8816618323326111, + "num_tokens": 791560812.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5381613969802856, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8738791942596436, + "num_tokens": 791602773.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6188585758209229, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8801770210266113, + "num_tokens": 791641234.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5766242742538452, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8855721950531006, + "num_tokens": 791682041.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5018235445022583, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8876021504402161, + "num_tokens": 791726127.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7303094863891602, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.866117537021637, + "num_tokens": 791760569.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5021456480026245, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8790978193283081, + "num_tokens": 791805559.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7101746797561646, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8762388825416565, + "num_tokens": 791842566.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7817250490188599, + "learning_rate": 1e-06, + "loss": 0.326, + "mean_token_accuracy": 0.8920010924339294, + "num_tokens": 791871837.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6987969875335693, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8638828992843628, + "num_tokens": 791907538.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.688141942024231, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8597767353057861, + "num_tokens": 791945907.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5072180032730103, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.882513701915741, + "num_tokens": 791985975.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6397178173065186, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8740219473838806, + "num_tokens": 792022427.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5920490026474, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8795133829116821, + "num_tokens": 792061665.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8071092367172241, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8766300082206726, + "num_tokens": 792093217.0, + "step": 20763 + }, + { + "epoch": 2.641394224653352, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.724234938621521, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8765953779220581, + "num_tokens": 792127604.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6192950010299683, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8838472962379456, + "num_tokens": 792162276.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5569571256637573, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8756669163703918, + "num_tokens": 792203155.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4701991081237793, + "learning_rate": 1e-06, + "loss": 0.3336, + "mean_token_accuracy": 0.8906310796737671, + "num_tokens": 792243543.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.723563313484192, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8727231025695801, + "num_tokens": 792277857.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4881173372268677, + "learning_rate": 1e-06, + "loss": 0.3197, + "mean_token_accuracy": 0.8940214514732361, + "num_tokens": 792317246.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.906124234199524, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8613573908805847, + "num_tokens": 792352070.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5215173959732056, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8902832865715027, + "num_tokens": 792389391.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.514496088027954, + "learning_rate": 1e-06, + "loss": 0.3456, + "mean_token_accuracy": 0.8895022869110107, + "num_tokens": 792430686.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6088343858718872, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8731862902641296, + "num_tokens": 792467809.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5442860126495361, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.871820330619812, + "num_tokens": 792509690.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.47938871383667, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8756576180458069, + "num_tokens": 792555706.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5617634057998657, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8717901110649109, + "num_tokens": 792599272.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.614332914352417, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8729041814804077, + "num_tokens": 792637049.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6654679775238037, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8741506338119507, + "num_tokens": 792671373.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6701698303222656, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.867138147354126, + "num_tokens": 792712198.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6417733430862427, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.859065055847168, + "num_tokens": 792751400.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5207171440124512, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8673238754272461, + "num_tokens": 792797175.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6667537689208984, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8779035210609436, + "num_tokens": 792835972.0, + "step": 20782 + }, + { + "epoch": 2.6438112199465715, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6763591766357422, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8723747730255127, + "num_tokens": 792873211.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6465405225753784, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8618040680885315, + "num_tokens": 792914680.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6882517337799072, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8840337991714478, + "num_tokens": 792954431.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6127771139144897, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8760738372802734, + "num_tokens": 792992085.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6710294485092163, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8582979440689087, + "num_tokens": 793032598.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8333256244659424, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8688173890113831, + "num_tokens": 793065060.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5566405057907104, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8621611595153809, + "num_tokens": 793108654.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6798344850540161, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8818275332450867, + "num_tokens": 793146015.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.771842122077942, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8594775795936584, + "num_tokens": 793180684.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6289118528366089, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8773272633552551, + "num_tokens": 793217204.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.837355613708496, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8656061291694641, + "num_tokens": 793255577.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5462578535079956, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8827022314071655, + "num_tokens": 793294283.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6360435485839844, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8725119233131409, + "num_tokens": 793331187.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6882859468460083, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.863653838634491, + "num_tokens": 793367675.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6476887464523315, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8570299744606018, + "num_tokens": 793407907.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7047032117843628, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8720037341117859, + "num_tokens": 793442670.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5630555152893066, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8648024201393127, + "num_tokens": 793487658.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8486055135726929, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8658367395401001, + "num_tokens": 793517761.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5254309177398682, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8716433048248291, + "num_tokens": 793559218.0, + "step": 20801 + }, + { + "epoch": 2.646228215239791, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.528125286102295, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8722751140594482, + "num_tokens": 793602003.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.745119571685791, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8751923441886902, + "num_tokens": 793634320.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.539860725402832, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.873112678527832, + "num_tokens": 793677056.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.687051773071289, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8749377727508545, + "num_tokens": 793710454.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6274892091751099, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8861496448516846, + "num_tokens": 793744995.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5869563817977905, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8756779432296753, + "num_tokens": 793782970.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.560990333557129, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8769896030426025, + "num_tokens": 793824665.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8293442726135254, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8740776777267456, + "num_tokens": 793858505.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.576680064201355, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8621475696563721, + "num_tokens": 793899166.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6845371723175049, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8766876459121704, + "num_tokens": 793933730.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5766628980636597, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8695453405380249, + "num_tokens": 793974907.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5321263074874878, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8715872168540955, + "num_tokens": 794017975.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6420670747756958, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.869966983795166, + "num_tokens": 794059852.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 2.8848648071289062e-05, + "grad_norm": 1.5528099536895752, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8731178045272827, + "num_tokens": 794100200.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5087623596191406, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8812010288238525, + "num_tokens": 794143327.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.673406958580017, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8717504143714905, + "num_tokens": 794182605.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6036665439605713, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8836181163787842, + "num_tokens": 794220044.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5344951152801514, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8659701347351074, + "num_tokens": 794266427.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5082834959030151, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8790798187255859, + "num_tokens": 794308623.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.658679723739624, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8698651790618896, + "num_tokens": 794348908.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5186505317687988, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8793158531188965, + "num_tokens": 794388495.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6691192388534546, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8761883974075317, + "num_tokens": 794422929.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5113037824630737, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8717761039733887, + "num_tokens": 794464997.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.852104902267456, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8658552169799805, + "num_tokens": 794496237.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5130810737609863, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8828770518302917, + "num_tokens": 794540397.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.711268424987793, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8771110773086548, + "num_tokens": 794575030.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6926509141921997, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8702458739280701, + "num_tokens": 794612085.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7254116535186768, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8648480176925659, + "num_tokens": 794648461.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6117665767669678, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8783961534500122, + "num_tokens": 794687476.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5405912399291992, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.861276388168335, + "num_tokens": 794731446.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.804497480392456, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8716641068458557, + "num_tokens": 794762012.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5793489217758179, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8831155896186829, + "num_tokens": 794799075.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.721685528755188, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8729062080383301, + "num_tokens": 794832777.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6501461267471313, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.875738263130188, + "num_tokens": 794868274.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6080436706542969, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8846964836120605, + "num_tokens": 794905329.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7279807329177856, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8664956092834473, + "num_tokens": 794941340.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6077516078948975, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.877056360244751, + "num_tokens": 794980114.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7654767036437988, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.865113377571106, + "num_tokens": 795016285.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7432254552841187, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8668023943901062, + "num_tokens": 795052330.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6892040967941284, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.869814395904541, + "num_tokens": 795092724.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6463143825531006, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8732589483261108, + "num_tokens": 795130298.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5868806838989258, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8789530992507935, + "num_tokens": 795169053.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.549076795578003, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8705734610557556, + "num_tokens": 795210871.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.555981993675232, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8667289018630981, + "num_tokens": 795251926.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4858123064041138, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8823347091674805, + "num_tokens": 795293258.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5953245162963867, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8834042549133301, + "num_tokens": 795330965.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6231670379638672, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8719180226325989, + "num_tokens": 795372906.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.777353286743164, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8751304149627686, + "num_tokens": 795408582.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8383630514144897, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8725553154945374, + "num_tokens": 795441524.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.522725224494934, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8581351637840271, + "num_tokens": 795487987.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4979900121688843, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8621653914451599, + "num_tokens": 795534329.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8486671447753906, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8706887364387512, + "num_tokens": 795567069.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4605774879455566, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.888370931148529, + "num_tokens": 795610513.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6741948127746582, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8840804696083069, + "num_tokens": 795645197.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.540910243988037, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8782932758331299, + "num_tokens": 795684557.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5465092658996582, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8764256238937378, + "num_tokens": 795725728.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6798306703567505, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.867252767086029, + "num_tokens": 795763169.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.725122094154358, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8799942135810852, + "num_tokens": 795798943.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.611034631729126, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8784774541854858, + "num_tokens": 795836739.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6373000144958496, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8660107254981995, + "num_tokens": 795878026.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6767747402191162, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8774560689926147, + "num_tokens": 795915431.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6306489706039429, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8819462060928345, + "num_tokens": 795954088.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.58043372631073, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8665963411331177, + "num_tokens": 795995692.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7340052127838135, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8690840601921082, + "num_tokens": 796032547.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6710054874420166, + "learning_rate": 1e-06, + "loss": 0.329, + "mean_token_accuracy": 0.893541693687439, + "num_tokens": 796063920.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7068015336990356, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8704442977905273, + "num_tokens": 796100674.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.504759669303894, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8865947723388672, + "num_tokens": 796141766.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5927594900131226, + "learning_rate": 1e-06, + "loss": 0.3361, + "mean_token_accuracy": 0.8951188325881958, + "num_tokens": 796180374.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7469804286956787, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8752719163894653, + "num_tokens": 796218388.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6719952821731567, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8817166090011597, + "num_tokens": 796253199.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5965899229049683, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8837705850601196, + "num_tokens": 796292078.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6592656373977661, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8656125068664551, + "num_tokens": 796331582.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5702036619186401, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8938391804695129, + "num_tokens": 796368624.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6551343202590942, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8749539852142334, + "num_tokens": 796408885.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.678160309791565, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8676825165748596, + "num_tokens": 796444751.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.67903470993042, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8806791305541992, + "num_tokens": 796482118.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5920066833496094, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8885370492935181, + "num_tokens": 796517886.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5533044338226318, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8842343091964722, + "num_tokens": 796557093.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6845486164093018, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.881077229976654, + "num_tokens": 796595041.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.534618854522705, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8773349523544312, + "num_tokens": 796637163.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.601378083229065, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8780279159545898, + "num_tokens": 796674978.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5690001249313354, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8889114856719971, + "num_tokens": 796713604.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5857089757919312, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8728530406951904, + "num_tokens": 796754974.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5411298274993896, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8895841836929321, + "num_tokens": 796792210.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6332865953445435, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8815670013427734, + "num_tokens": 796830670.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7448811531066895, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8675865530967712, + "num_tokens": 796869724.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.520974040031433, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8823091387748718, + "num_tokens": 796912356.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6761342287063599, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8773048520088196, + "num_tokens": 796946922.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5533908605575562, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8704140782356262, + "num_tokens": 796990881.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.768522024154663, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8736240863800049, + "num_tokens": 797027629.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6460391283035278, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8813745975494385, + "num_tokens": 797061695.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6949403285980225, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8830241560935974, + "num_tokens": 797097898.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8034887313842773, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8638724088668823, + "num_tokens": 797133790.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7654112577438354, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8716433048248291, + "num_tokens": 797166931.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6749722957611084, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8639904856681824, + "num_tokens": 797207302.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7326923608779907, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8695146441459656, + "num_tokens": 797238763.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.621567726135254, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8739787340164185, + "num_tokens": 797275530.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.641225814819336, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.864942193031311, + "num_tokens": 797312658.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.661797046661377, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8762762546539307, + "num_tokens": 797350650.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4951592683792114, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.8936899900436401, + "num_tokens": 797390001.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6357121467590332, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8649215698242188, + "num_tokens": 797430030.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.610823392868042, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8663424849510193, + "num_tokens": 797469304.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6318588256835938, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8785362243652344, + "num_tokens": 797506503.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.722861647605896, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8730515837669373, + "num_tokens": 797543854.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6067018508911133, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.857723593711853, + "num_tokens": 797586515.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.750349760055542, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8663069605827332, + "num_tokens": 797621407.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7715167999267578, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8579951524734497, + "num_tokens": 797655456.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6591219902038574, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8661620616912842, + "num_tokens": 797694566.0, + "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7449681758880615, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8554717302322388, + "num_tokens": 797733378.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6148096323013306, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8611854314804077, + "num_tokens": 797773430.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5887373685836792, + "learning_rate": 1e-06, + "loss": 0.3402, + "mean_token_accuracy": 0.8905377388000488, + "num_tokens": 797811491.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5923197269439697, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8886589407920837, + "num_tokens": 797847629.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.494498372077942, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8704761266708374, + "num_tokens": 797890274.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6243071556091309, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8708591461181641, + "num_tokens": 797927899.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5905884504318237, + "learning_rate": 1e-06, + "loss": 0.3354, + "mean_token_accuracy": 0.8909311890602112, + "num_tokens": 797964979.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8044081926345825, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8666512966156006, + "num_tokens": 797997438.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6779857873916626, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8739825487136841, + "num_tokens": 798035282.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6516170501708984, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8726662397384644, + "num_tokens": 798074530.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.700799584388733, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8672583699226379, + "num_tokens": 798114328.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7534749507904053, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8746328353881836, + "num_tokens": 798149152.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5912833213806152, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8729298114776611, + "num_tokens": 798187247.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6526259183883667, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8690931797027588, + "num_tokens": 798224377.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6677130460739136, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.853659987449646, + "num_tokens": 798265669.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.616674780845642, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8724073171615601, + "num_tokens": 798304732.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8151202201843262, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8776845932006836, + "num_tokens": 798339599.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7446571588516235, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8529484272003174, + "num_tokens": 798377764.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5505847930908203, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8786123991012573, + "num_tokens": 798417330.0, + "step": 20928 + }, + { + "epoch": 2.662383920620786, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.713085651397705, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8792625665664673, + "num_tokens": 798453951.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6088783740997314, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8733416199684143, + "num_tokens": 798495985.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6123251914978027, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8859491348266602, + "num_tokens": 798531598.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7919094562530518, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8734833002090454, + "num_tokens": 798568243.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6993272304534912, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8734962940216064, + "num_tokens": 798604659.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7963629961013794, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8637806177139282, + "num_tokens": 798641765.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5120649337768555, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8851161599159241, + "num_tokens": 798681055.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6286088228225708, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8639958500862122, + "num_tokens": 798723050.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.656772494316101, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8723093867301941, + "num_tokens": 798766064.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6585801839828491, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8727927207946777, + "num_tokens": 798807070.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6490620374679565, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8719547986984253, + "num_tokens": 798847611.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6707351207733154, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8539636135101318, + "num_tokens": 798888172.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.612755537033081, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8797526359558105, + "num_tokens": 798924998.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4979767799377441, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8814474940299988, + "num_tokens": 798967346.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.588005542755127, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8754340410232544, + "num_tokens": 799009374.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6449880599975586, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8804199695587158, + "num_tokens": 799044028.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7842236757278442, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.856239914894104, + "num_tokens": 799084658.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7549209594726562, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8732988834381104, + "num_tokens": 799119436.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6200034618377686, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8767996430397034, + "num_tokens": 799160466.0, + "step": 20947 + }, + { + "epoch": 2.6648009159140056, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.487849235534668, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8833870887756348, + "num_tokens": 799201382.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8309903144836426, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8461105227470398, + "num_tokens": 799236515.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.541435956954956, + "learning_rate": 1e-06, + "loss": 0.3448, + "mean_token_accuracy": 0.8890775442123413, + "num_tokens": 799278327.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5328682661056519, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.866442084312439, + "num_tokens": 799321858.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.820860505104065, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8750162124633789, + "num_tokens": 799357239.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7119029760360718, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8666821718215942, + "num_tokens": 799395680.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7050068378448486, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8779336214065552, + "num_tokens": 799432862.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6590574979782104, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8594104051589966, + "num_tokens": 799471826.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7364176511764526, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8906561136245728, + "num_tokens": 799501513.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8848271369934082, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8625891208648682, + "num_tokens": 799532822.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7631863355636597, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8720682859420776, + "num_tokens": 799569578.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8985322713851929, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8553221225738525, + "num_tokens": 799603342.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5787005424499512, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8734201788902283, + "num_tokens": 799645117.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7710587978363037, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8663436770439148, + "num_tokens": 799679550.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.765015959739685, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8729708194732666, + "num_tokens": 799716107.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5709083080291748, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8813760280609131, + "num_tokens": 799757045.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5721194744110107, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8815622329711914, + "num_tokens": 799797543.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6686965227127075, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.870198130607605, + "num_tokens": 799838456.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6760478019714355, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8782342672348022, + "num_tokens": 799878968.0, + "step": 20966 + }, + { + "epoch": 2.6672179112072256, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4785372018814087, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8849177956581116, + "num_tokens": 799920792.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.565685510635376, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8674566745758057, + "num_tokens": 799963298.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6506446599960327, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8762307167053223, + "num_tokens": 800000115.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.695042610168457, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8819872140884399, + "num_tokens": 800038101.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7029951810836792, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8830372095108032, + "num_tokens": 800075446.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5954474210739136, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8780739307403564, + "num_tokens": 800115445.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5936744213104248, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8795591592788696, + "num_tokens": 800155461.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7114344835281372, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8838236331939697, + "num_tokens": 800195603.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5925341844558716, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8865056037902832, + "num_tokens": 800232138.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7669428586959839, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.878991961479187, + "num_tokens": 800268378.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.749566674232483, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8672879934310913, + "num_tokens": 800302341.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.690025806427002, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8646224141120911, + "num_tokens": 800341028.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6602991819381714, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8604487180709839, + "num_tokens": 800383043.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5643813610076904, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8877184391021729, + "num_tokens": 800423417.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5547537803649902, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8768402934074402, + "num_tokens": 800460168.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6109579801559448, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8719892501831055, + "num_tokens": 800498940.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.637568473815918, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8775275945663452, + "num_tokens": 800534309.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6190582513809204, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8795551657676697, + "num_tokens": 800571280.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5843303203582764, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.87043297290802, + "num_tokens": 800610598.0, + "step": 20985 + }, + { + "epoch": 2.669634906500445, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 4.639649391174316, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8664909601211548, + "num_tokens": 800650252.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7227139472961426, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8772844672203064, + "num_tokens": 800685324.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5948113203048706, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8661788702011108, + "num_tokens": 800729456.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7367372512817383, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8652124404907227, + "num_tokens": 800765727.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6404200792312622, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8595044016838074, + "num_tokens": 800806079.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.649424433708191, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.861659586429596, + "num_tokens": 800846636.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6763970851898193, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8831624388694763, + "num_tokens": 800885951.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6541728973388672, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8782645463943481, + "num_tokens": 800923733.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.689807653427124, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8770058751106262, + "num_tokens": 800958952.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6528836488723755, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8830645084381104, + "num_tokens": 800995287.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5787256956100464, + "learning_rate": 1e-06, + "loss": 0.3409, + "mean_token_accuracy": 0.8879753947257996, + "num_tokens": 801030670.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.545760989189148, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8740899562835693, + "num_tokens": 801072455.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5852659940719604, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8784510493278503, + "num_tokens": 801109119.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7926145792007446, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8681943416595459, + "num_tokens": 801141349.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8501473665237427, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8800151944160461, + "num_tokens": 801184395.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.60802161693573, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8701868653297424, + "num_tokens": 801224079.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6306796073913574, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8741214275360107, + "num_tokens": 801259193.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7547346353530884, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8636565804481506, + "num_tokens": 801295861.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6986020803451538, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8727978467941284, + "num_tokens": 801330365.0, + "step": 21004 + }, + { + "epoch": 2.672051901793665, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.532554268836975, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8673096895217896, + "num_tokens": 801372218.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5346896648406982, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8857243061065674, + "num_tokens": 801413002.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5732626914978027, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8604648113250732, + "num_tokens": 801454530.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.720193862915039, + "learning_rate": 1e-06, + "loss": 0.3473, + "mean_token_accuracy": 0.8875659704208374, + "num_tokens": 801491029.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4650321006774902, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8752561807632446, + "num_tokens": 801537908.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.734890103340149, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8560237884521484, + "num_tokens": 801575349.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6593751907348633, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8652817010879517, + "num_tokens": 801609327.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.544013500213623, + "learning_rate": 1e-06, + "loss": 0.3206, + "mean_token_accuracy": 0.8975833058357239, + "num_tokens": 801646729.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.685981035232544, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.865373969078064, + "num_tokens": 801683899.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6710822582244873, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8739038705825806, + "num_tokens": 801720186.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.682796597480774, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8855386972427368, + "num_tokens": 801754564.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6944104433059692, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8732593059539795, + "num_tokens": 801792433.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6234030723571777, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8701245188713074, + "num_tokens": 801833254.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6959234476089478, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8816660642623901, + "num_tokens": 801871163.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7504295110702515, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8789157867431641, + "num_tokens": 801904740.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5400629043579102, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8803811073303223, + "num_tokens": 801946270.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7014403343200684, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8709273934364319, + "num_tokens": 801981358.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7805304527282715, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8792546987533569, + "num_tokens": 802013817.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6229404211044312, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8814241290092468, + "num_tokens": 802051816.0, + "step": 21023 + }, + { + "epoch": 2.674468897086885, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.576348900794983, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8678198456764221, + "num_tokens": 802091116.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6236672401428223, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.866643488407135, + "num_tokens": 802129238.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5163413286209106, + "learning_rate": 1e-06, + "loss": 0.315, + "mean_token_accuracy": 0.8973082304000854, + "num_tokens": 802167030.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6041738986968994, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8837091326713562, + "num_tokens": 802204405.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.886451005935669, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8717098236083984, + "num_tokens": 802236785.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8320516347885132, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8741058111190796, + "num_tokens": 802269981.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5950802564620972, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8881349563598633, + "num_tokens": 802306980.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5049816370010376, + "learning_rate": 1e-06, + "loss": 0.3178, + "mean_token_accuracy": 0.8972647190093994, + "num_tokens": 802349504.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7426728010177612, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8890373110771179, + "num_tokens": 802383992.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5481188297271729, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8803758025169373, + "num_tokens": 802427048.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8351974487304688, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8672379851341248, + "num_tokens": 802462942.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4147357940673828, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8882945775985718, + "num_tokens": 802507713.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7960666418075562, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8634449243545532, + "num_tokens": 802539767.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.683775544166565, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8793009519577026, + "num_tokens": 802575157.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.477005958557129, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8775167465209961, + "num_tokens": 802619080.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6728768348693848, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.86025071144104, + "num_tokens": 802658151.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.774418592453003, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8662170767784119, + "num_tokens": 802688424.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.639614224433899, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8826944828033447, + "num_tokens": 802724334.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7321950197219849, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8667544722557068, + "num_tokens": 802761097.0, + "step": 21042 + }, + { + "epoch": 2.6768858923801044, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6751855611801147, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8807632327079773, + "num_tokens": 802797791.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6629595756530762, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8756561279296875, + "num_tokens": 802833598.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7028850317001343, + "learning_rate": 1e-06, + "loss": 0.3594, + "mean_token_accuracy": 0.8848877549171448, + "num_tokens": 802869975.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6649127006530762, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8651918768882751, + "num_tokens": 802908525.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6969475746154785, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8761051893234253, + "num_tokens": 802943842.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6498665809631348, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8681958913803101, + "num_tokens": 802983996.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7384440898895264, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8842422962188721, + "num_tokens": 803015573.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7626768350601196, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.877187967300415, + "num_tokens": 803049815.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.55488920211792, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8648240566253662, + "num_tokens": 803092449.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7448354959487915, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8752774000167847, + "num_tokens": 803123920.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6438535451889038, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8657139539718628, + "num_tokens": 803164604.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5293142795562744, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8599069118499756, + "num_tokens": 803207390.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5092031955718994, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8707695603370667, + "num_tokens": 803253504.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.709247350692749, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8769502639770508, + "num_tokens": 803287458.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6786761283874512, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.859606146812439, + "num_tokens": 803327863.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6965941190719604, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8559396862983704, + "num_tokens": 803368050.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.527573585510254, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8861221671104431, + "num_tokens": 803408349.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7077571153640747, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8742340207099915, + "num_tokens": 803445525.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.602909803390503, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8774285912513733, + "num_tokens": 803486043.0, + "step": 21061 + }, + { + "epoch": 2.679302887673324, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.708785057067871, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8665841221809387, + "num_tokens": 803524000.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7578948736190796, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8706488609313965, + "num_tokens": 803558036.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.595694661140442, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.882673442363739, + "num_tokens": 803593888.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5165479183197021, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8819737434387207, + "num_tokens": 803637758.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.618051528930664, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8699726462364197, + "num_tokens": 803677760.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.550894856452942, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8816055059432983, + "num_tokens": 803717355.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7410252094268799, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8826651573181152, + "num_tokens": 803752904.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7024593353271484, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8521841764450073, + "num_tokens": 803791404.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6857999563217163, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8759704232215881, + "num_tokens": 803826803.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5503325462341309, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8560075163841248, + "num_tokens": 803872679.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7985544204711914, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8826327919960022, + "num_tokens": 803911036.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.620758056640625, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8798825144767761, + "num_tokens": 803950208.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6219227313995361, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8862836956977844, + "num_tokens": 803987351.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5580497980117798, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8952039480209351, + "num_tokens": 804026546.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6341524124145508, + "learning_rate": 1e-06, + "loss": 0.3533, + "mean_token_accuracy": 0.8851799964904785, + "num_tokens": 804063402.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5972956418991089, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8691613674163818, + "num_tokens": 804102960.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7728691101074219, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.865566611289978, + "num_tokens": 804141127.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7717262506484985, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8703360557556152, + "num_tokens": 804177675.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6727303266525269, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8780136108398438, + "num_tokens": 804216870.0, + "step": 21080 + }, + { + "epoch": 2.6817198829665436, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6931735277175903, + "learning_rate": 1e-06, + "loss": 0.3233, + "mean_token_accuracy": 0.8933691382408142, + "num_tokens": 804252431.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6218488216400146, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8655474185943604, + "num_tokens": 804295202.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6602612733840942, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8697733879089355, + "num_tokens": 804329732.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7025808095932007, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.867369532585144, + "num_tokens": 804370904.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.763129472732544, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.8877980709075928, + "num_tokens": 804404633.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8029894828796387, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8811799883842468, + "num_tokens": 804434846.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7245906591415405, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8836386203765869, + "num_tokens": 804467627.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.631952166557312, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8701860308647156, + "num_tokens": 804511278.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5897181034088135, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8804804086685181, + "num_tokens": 804550505.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.618194818496704, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8699567317962646, + "num_tokens": 804591426.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6621588468551636, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8817417621612549, + "num_tokens": 804626337.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7162343263626099, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8750274777412415, + "num_tokens": 804665028.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6188552379608154, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8679152131080627, + "num_tokens": 804707937.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5705888271331787, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8814646601676941, + "num_tokens": 804745667.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8009519577026367, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8686834573745728, + "num_tokens": 804781253.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.836580514907837, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8759455680847168, + "num_tokens": 804816040.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6489078998565674, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.882230818271637, + "num_tokens": 804851625.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6573435068130493, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8743418455123901, + "num_tokens": 804888823.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5548008680343628, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.874823272228241, + "num_tokens": 804933113.0, + "step": 21099 + }, + { + "epoch": 2.684136878259763, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5056184530258179, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.879652738571167, + "num_tokens": 804976423.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5324186086654663, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8766641020774841, + "num_tokens": 805015306.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6952446699142456, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8841893076896667, + "num_tokens": 805054852.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5123742818832397, + "learning_rate": 1e-06, + "loss": 0.3537, + "mean_token_accuracy": 0.8871164321899414, + "num_tokens": 805094928.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5590969324111938, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8789354562759399, + "num_tokens": 805133420.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5072599649429321, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8806300163269043, + "num_tokens": 805172713.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.534825325012207, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8829075694084167, + "num_tokens": 805212127.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.609569787979126, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8788651823997498, + "num_tokens": 805248371.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.569153904914856, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8826282024383545, + "num_tokens": 805289262.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7358747720718384, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8736823797225952, + "num_tokens": 805325698.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5706768035888672, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8684625625610352, + "num_tokens": 805369439.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.646920084953308, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8754528164863586, + "num_tokens": 805405535.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.674153447151184, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8707228302955627, + "num_tokens": 805443396.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7417380809783936, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8847278356552124, + "num_tokens": 805477838.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5843734741210938, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8762513399124146, + "num_tokens": 805518322.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6400701999664307, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8610165119171143, + "num_tokens": 805558555.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7475919723510742, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8606610298156738, + "num_tokens": 805593893.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.659346580505371, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8815525770187378, + "num_tokens": 805634225.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5071133375167847, + "learning_rate": 1e-06, + "loss": 0.3512, + "mean_token_accuracy": 0.8881457448005676, + "num_tokens": 805674925.0, + "step": 21118 + }, + { + "epoch": 2.686553873552983, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6475459337234497, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8799864053726196, + "num_tokens": 805714298.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7148466110229492, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8699133992195129, + "num_tokens": 805753521.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5802336931228638, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8727126717567444, + "num_tokens": 805793771.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5174497365951538, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8685845136642456, + "num_tokens": 805836108.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.642011284828186, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8703980445861816, + "num_tokens": 805875819.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8710190057754517, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8645491600036621, + "num_tokens": 805908648.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7337106466293335, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.865944504737854, + "num_tokens": 805948780.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5970041751861572, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8825241923332214, + "num_tokens": 805987854.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.626417636871338, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8767040967941284, + "num_tokens": 806027355.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6297310590744019, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.865567684173584, + "num_tokens": 806069945.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.620080828666687, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8840401768684387, + "num_tokens": 806108807.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.4740087985992432, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8851532936096191, + "num_tokens": 806149542.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5106191635131836, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8672562837600708, + "num_tokens": 806194700.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7142072916030884, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.880088210105896, + "num_tokens": 806231867.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.740531325340271, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8664103746414185, + "num_tokens": 806271217.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7367619276046753, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8644455075263977, + "num_tokens": 806310463.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.671700119972229, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8777260780334473, + "num_tokens": 806347960.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6470165252685547, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8698772192001343, + "num_tokens": 806387572.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5106242895126343, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8786430358886719, + "num_tokens": 806430441.0, + "step": 21137 + }, + { + "epoch": 2.688970868846203, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.583832859992981, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8890082836151123, + "num_tokens": 806469194.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6326173543930054, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.878254234790802, + "num_tokens": 806505722.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6589957475662231, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.871291995048523, + "num_tokens": 806541919.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6955633163452148, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8786514401435852, + "num_tokens": 806579894.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6450321674346924, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8813667297363281, + "num_tokens": 806617489.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7006250619888306, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8692653179168701, + "num_tokens": 806657542.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7272964715957642, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8677365779876709, + "num_tokens": 806692015.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7263377904891968, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8879832029342651, + "num_tokens": 806725017.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6315908432006836, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.875922679901123, + "num_tokens": 806765326.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7886942625045776, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8638910055160522, + "num_tokens": 806804184.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8207815885543823, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8816072344779968, + "num_tokens": 806840016.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6120198965072632, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8718330264091492, + "num_tokens": 806883858.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5693095922470093, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8729236721992493, + "num_tokens": 806925281.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6319869756698608, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8766248226165771, + "num_tokens": 806961833.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.625620722770691, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8886911869049072, + "num_tokens": 806998358.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5676789283752441, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8734790086746216, + "num_tokens": 807043203.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7729862928390503, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8708428144454956, + "num_tokens": 807078387.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6050959825515747, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8712033629417419, + "num_tokens": 807119153.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6916595697402954, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.862787663936615, + "num_tokens": 807155648.0, + "step": 21156 + }, + { + "epoch": 2.6913878641394224, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.556462287902832, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.874773383140564, + "num_tokens": 807196643.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5627435445785522, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8825769424438477, + "num_tokens": 807238738.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5139579772949219, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8795782327651978, + "num_tokens": 807279797.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5940680503845215, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8802645206451416, + "num_tokens": 807318996.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5462754964828491, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.879521369934082, + "num_tokens": 807359467.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.663155198097229, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8913947343826294, + "num_tokens": 807398303.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6207741498947144, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8743711709976196, + "num_tokens": 807435839.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5287864208221436, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8785646557807922, + "num_tokens": 807480208.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7050758600234985, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8753325939178467, + "num_tokens": 807516784.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6516172885894775, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8806114792823792, + "num_tokens": 807550722.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6849581003189087, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.878716230392456, + "num_tokens": 807587516.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7689704895019531, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8664264678955078, + "num_tokens": 807622190.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6619369983673096, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.875624418258667, + "num_tokens": 807659457.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6685229539871216, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.876156747341156, + "num_tokens": 807694658.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7858515977859497, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8801888823509216, + "num_tokens": 807728856.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.536397933959961, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8811080455780029, + "num_tokens": 807769848.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5869970321655273, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8793294429779053, + "num_tokens": 807809045.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5465340614318848, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8841449022293091, + "num_tokens": 807848671.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6698311567306519, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8728832006454468, + "num_tokens": 807888680.0, + "step": 21175 + }, + { + "epoch": 2.693804859432642, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6146305799484253, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8809311389923096, + "num_tokens": 807927655.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5601284503936768, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.869776725769043, + "num_tokens": 807972648.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6197677850723267, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.88072270154953, + "num_tokens": 808009846.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5773288011550903, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8665869235992432, + "num_tokens": 808050240.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7813401222229004, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.868794858455658, + "num_tokens": 808083160.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6981618404388428, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8583534955978394, + "num_tokens": 808124361.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5017101764678955, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8768371939659119, + "num_tokens": 808168456.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7304949760437012, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.861127495765686, + "num_tokens": 808206912.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.529958963394165, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8664127588272095, + "num_tokens": 808252202.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7125414609909058, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8754880428314209, + "num_tokens": 808289593.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.5362354516983032, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8772728443145752, + "num_tokens": 808327381.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.59225594997406, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8687620759010315, + "num_tokens": 808365387.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8452682495117188, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8497458696365356, + "num_tokens": 808398172.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7082502841949463, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8720375895500183, + "num_tokens": 808433090.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8051913976669312, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8557549118995667, + "num_tokens": 808465667.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.595771074295044, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.887854814529419, + "num_tokens": 808502814.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.4974095821380615, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8763859868049622, + "num_tokens": 808545630.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6919517517089844, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8832045793533325, + "num_tokens": 808581614.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.7370924949645996, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8601908087730408, + "num_tokens": 808619779.0, + "step": 21194 + }, + { + "epoch": 2.696221854725862, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.651658535003662, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8710511922836304, + "num_tokens": 808659023.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.607351303100586, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8770302534103394, + "num_tokens": 808697016.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.720481038093567, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8776654005050659, + "num_tokens": 808731054.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6912691593170166, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8823543787002563, + "num_tokens": 808767736.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6705604791641235, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8714628219604492, + "num_tokens": 808806128.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5042001008987427, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.873945951461792, + "num_tokens": 808850916.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7412092685699463, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8602423667907715, + "num_tokens": 808886049.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5740951299667358, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.876812219619751, + "num_tokens": 808926818.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.798545479774475, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8709926009178162, + "num_tokens": 808955739.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7329086065292358, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8707424998283386, + "num_tokens": 808994078.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5579571723937988, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.883528470993042, + "num_tokens": 809030897.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5613476037979126, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8900847434997559, + "num_tokens": 809076597.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6438877582550049, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8621413707733154, + "num_tokens": 809117414.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5423794984817505, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8835713863372803, + "num_tokens": 809157896.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.622313141822815, + "learning_rate": 1e-06, + "loss": 0.3438, + "mean_token_accuracy": 0.8880138397216797, + "num_tokens": 809194066.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6496964693069458, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.872270941734314, + "num_tokens": 809233671.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6042897701263428, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8782263994216919, + "num_tokens": 809270813.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.576324701309204, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8760511875152588, + "num_tokens": 809315687.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.708432674407959, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8704512715339661, + "num_tokens": 809350781.0, + "step": 21213 + }, + { + "epoch": 2.6986388500190817, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6951448917388916, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8820000886917114, + "num_tokens": 809384371.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6335395574569702, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8739911913871765, + "num_tokens": 809420812.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.592116117477417, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8852369785308838, + "num_tokens": 809458815.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6477032899856567, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8738212585449219, + "num_tokens": 809494987.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6949834823608398, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8529036045074463, + "num_tokens": 809537523.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.6444646120071411, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.875691831111908, + "num_tokens": 809576033.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6797051429748535, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8560512661933899, + "num_tokens": 809613821.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6684131622314453, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8747463226318359, + "num_tokens": 809650102.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8691221475601196, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8765994906425476, + "num_tokens": 809691046.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5955913066864014, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8829292058944702, + "num_tokens": 809725378.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.855819821357727, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8737474083900452, + "num_tokens": 809758727.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5956579446792603, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.870063066482544, + "num_tokens": 809800000.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5784001350402832, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8690183162689209, + "num_tokens": 809840661.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7807766199111938, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8891952037811279, + "num_tokens": 809872866.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7374937534332275, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8667875528335571, + "num_tokens": 809912047.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7567673921585083, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8721779584884644, + "num_tokens": 809951913.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6270549297332764, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8725168704986572, + "num_tokens": 809991703.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6035449504852295, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8773524165153503, + "num_tokens": 810033483.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7679611444473267, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8805614113807678, + "num_tokens": 810065000.0, + "step": 21232 + }, + { + "epoch": 2.7010558453123013, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5467499494552612, + "learning_rate": 1e-06, + "loss": 0.3417, + "mean_token_accuracy": 0.8869432210922241, + "num_tokens": 810103383.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5460349321365356, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8560027480125427, + "num_tokens": 810146141.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.619964838027954, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8717480897903442, + "num_tokens": 810191595.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5987269878387451, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8774229884147644, + "num_tokens": 810230251.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5264782905578613, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.879375696182251, + "num_tokens": 810270364.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6397719383239746, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.876287579536438, + "num_tokens": 810308871.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6412676572799683, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.88358074426651, + "num_tokens": 810345308.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6592621803283691, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8853111267089844, + "num_tokens": 810379037.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6149888038635254, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8760375380516052, + "num_tokens": 810418232.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5599232912063599, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8819237947463989, + "num_tokens": 810456159.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6406590938568115, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8755103349685669, + "num_tokens": 810496717.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7969837188720703, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8657960891723633, + "num_tokens": 810530266.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5908938646316528, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8694748282432556, + "num_tokens": 810569536.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7731995582580566, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8755236864089966, + "num_tokens": 810604296.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6107170581817627, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8762651681900024, + "num_tokens": 810642452.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7312275171279907, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8741870522499084, + "num_tokens": 810676915.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 2.0728042125701904, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.871366024017334, + "num_tokens": 810711028.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6389377117156982, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8765099048614502, + "num_tokens": 810749818.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7236872911453247, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8694037795066833, + "num_tokens": 810783486.0, + "step": 21251 + }, + { + "epoch": 2.703472840605521, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4693336486816406, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8895277380943298, + "num_tokens": 810827732.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.685896396636963, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8604225516319275, + "num_tokens": 810867496.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6603964567184448, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8752773404121399, + "num_tokens": 810906874.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.496443510055542, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8791413307189941, + "num_tokens": 810950765.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.715419888496399, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8606643676757812, + "num_tokens": 810988782.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7655891180038452, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8759600520133972, + "num_tokens": 811021165.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.533010482788086, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8815838098526001, + "num_tokens": 811059603.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.537522315979004, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.872350811958313, + "num_tokens": 811103719.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5526992082595825, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8737616539001465, + "num_tokens": 811147168.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6783969402313232, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8895444869995117, + "num_tokens": 811180536.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5997170209884644, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8759292960166931, + "num_tokens": 811218481.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6282968521118164, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8700916171073914, + "num_tokens": 811255422.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5753995180130005, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.8955445289611816, + "num_tokens": 811293960.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5811066627502441, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8868945240974426, + "num_tokens": 811335236.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6577401161193848, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8766422867774963, + "num_tokens": 811376381.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7144744396209717, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8743041753768921, + "num_tokens": 811416706.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5045490264892578, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8776165843009949, + "num_tokens": 811460322.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6150989532470703, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8783642649650574, + "num_tokens": 811501078.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6236340999603271, + "learning_rate": 1e-06, + "loss": 0.3392, + "mean_token_accuracy": 0.8914999961853027, + "num_tokens": 811534959.0, + "step": 21270 + }, + { + "epoch": 2.7058898358987404, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6272616386413574, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8847165107727051, + "num_tokens": 811570283.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7371703386306763, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8718146681785583, + "num_tokens": 811605025.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6142691373825073, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8735159039497375, + "num_tokens": 811643207.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.619842767715454, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8625853061676025, + "num_tokens": 811682143.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5917409658432007, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8762081265449524, + "num_tokens": 811721551.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7048232555389404, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.866712212562561, + "num_tokens": 811756894.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5762532949447632, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8777722716331482, + "num_tokens": 811795549.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.494160771369934, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8695310354232788, + "num_tokens": 811840296.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.4940464496612549, + "learning_rate": 1e-06, + "loss": 0.334, + "mean_token_accuracy": 0.8932013511657715, + "num_tokens": 811877750.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 2.8967857360839844e-05, + "grad_norm": 1.8623096942901611, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8830353617668152, + "num_tokens": 811902985.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7141605615615845, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8707835674285889, + "num_tokens": 811939756.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.7382172346115112, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8769090175628662, + "num_tokens": 811978332.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.5860233306884766, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8750771880149841, + "num_tokens": 812022412.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6363434791564941, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8766703605651855, + "num_tokens": 812059919.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.64487624168396, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8678703904151917, + "num_tokens": 812098996.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.8063336610794067, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8636290431022644, + "num_tokens": 812134207.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.6366595029830933, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.884010374546051, + "num_tokens": 812169231.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 2.9087066650390625e-05, + "grad_norm": 1.601645588874817, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8781276941299438, + "num_tokens": 812209533.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8380792140960693, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8613675832748413, + "num_tokens": 812241118.0, + "step": 21289 + }, + { + "epoch": 2.7083068311919605, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7249469757080078, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8833518624305725, + "num_tokens": 812276325.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.580763816833496, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8809918761253357, + "num_tokens": 812319671.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7912554740905762, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8678268194198608, + "num_tokens": 812352916.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5981049537658691, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8744888305664062, + "num_tokens": 812392040.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6383867263793945, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8695011138916016, + "num_tokens": 812429002.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.696512222290039, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8798326849937439, + "num_tokens": 812464133.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.666670799255371, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8822877407073975, + "num_tokens": 812503143.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.580369234085083, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.8845295906066895, + "num_tokens": 812546247.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7067346572875977, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8743716478347778, + "num_tokens": 812583881.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6819270849227905, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8749470114707947, + "num_tokens": 812618892.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.80680251121521, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8847278952598572, + "num_tokens": 812650277.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6019141674041748, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8811647891998291, + "num_tokens": 812690909.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6367621421813965, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8642303943634033, + "num_tokens": 812731749.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.636720895767212, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8805861473083496, + "num_tokens": 812769861.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5396592617034912, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.88104248046875, + "num_tokens": 812807759.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6335694789886475, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8694977760314941, + "num_tokens": 812849477.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.654834270477295, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8951129913330078, + "num_tokens": 812883109.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6633529663085938, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8769088387489319, + "num_tokens": 812915580.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7758550643920898, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8655983209609985, + "num_tokens": 812950447.0, + "step": 21308 + }, + { + "epoch": 2.71072382648518, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5339559316635132, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.880874752998352, + "num_tokens": 812993383.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5613689422607422, + "learning_rate": 1e-06, + "loss": 0.3329, + "mean_token_accuracy": 0.8895313739776611, + "num_tokens": 813031850.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6092755794525146, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8736852407455444, + "num_tokens": 813067859.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5739939212799072, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8828074336051941, + "num_tokens": 813105646.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5752125978469849, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.881004810333252, + "num_tokens": 813144467.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6789129972457886, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8848523497581482, + "num_tokens": 813178536.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.551190972328186, + "learning_rate": 1e-06, + "loss": 0.3475, + "mean_token_accuracy": 0.8865845203399658, + "num_tokens": 813216315.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.473893165588379, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8748340606689453, + "num_tokens": 813261139.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.660770058631897, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8705963492393494, + "num_tokens": 813297938.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.534035086631775, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8790618181228638, + "num_tokens": 813339485.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6948732137680054, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8608201742172241, + "num_tokens": 813376519.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5953277349472046, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8743065595626831, + "num_tokens": 813415732.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.58139169216156, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8705931901931763, + "num_tokens": 813458115.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7475926876068115, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8653005957603455, + "num_tokens": 813497225.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5958294868469238, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8691954016685486, + "num_tokens": 813534796.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6775754690170288, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8844333291053772, + "num_tokens": 813567196.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7154780626296997, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8578723073005676, + "num_tokens": 813606059.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5724503993988037, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8778190612792969, + "num_tokens": 813647462.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6566493511199951, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8873143196105957, + "num_tokens": 813689404.0, + "step": 21327 + }, + { + "epoch": 2.7131408217783997, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6547777652740479, + "learning_rate": 1e-06, + "loss": 0.3558, + "mean_token_accuracy": 0.8806421756744385, + "num_tokens": 813724464.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5462062358856201, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8839772939682007, + "num_tokens": 813762888.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.611741542816162, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8757839798927307, + "num_tokens": 813803316.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7109010219573975, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8551084399223328, + "num_tokens": 813838483.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.783551573753357, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.858099102973938, + "num_tokens": 813873690.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6590161323547363, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8743410110473633, + "num_tokens": 813910337.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 2.567481279373169, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8671864867210388, + "num_tokens": 813948342.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.602311372756958, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8840718269348145, + "num_tokens": 813983851.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4895238876342773, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8840070962905884, + "num_tokens": 814028986.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6708378791809082, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8709047436714172, + "num_tokens": 814065327.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6360334157943726, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8741952180862427, + "num_tokens": 814104431.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5389480590820312, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8878061771392822, + "num_tokens": 814143608.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.798299789428711, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8729861974716187, + "num_tokens": 814176108.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.573042869567871, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8764572739601135, + "num_tokens": 814219429.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6585032939910889, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8823629021644592, + "num_tokens": 814258259.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 3.76774263381958, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8688095808029175, + "num_tokens": 814293097.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.858824610710144, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8832312226295471, + "num_tokens": 814324854.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6209660768508911, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8738266229629517, + "num_tokens": 814368474.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7354564666748047, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8778233528137207, + "num_tokens": 814402946.0, + "step": 21346 + }, + { + "epoch": 2.7155578170716193, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6431077718734741, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8742939233779907, + "num_tokens": 814446565.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5720787048339844, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8819929957389832, + "num_tokens": 814486569.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8015657663345337, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.865662693977356, + "num_tokens": 814519379.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7664554119110107, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8707656860351562, + "num_tokens": 814560763.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7730156183242798, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8616796731948853, + "num_tokens": 814594696.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.701581597328186, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8897238969802856, + "num_tokens": 814628526.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.491595983505249, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8788564205169678, + "num_tokens": 814673034.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.642228603363037, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.873205840587616, + "num_tokens": 814710059.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.634874701499939, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.875442385673523, + "num_tokens": 814746323.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6903139352798462, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8754692077636719, + "num_tokens": 814782143.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7325152158737183, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8785600662231445, + "num_tokens": 814816239.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5764875411987305, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8699572086334229, + "num_tokens": 814858772.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.534153699874878, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8835201263427734, + "num_tokens": 814898241.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6528750658035278, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8725146055221558, + "num_tokens": 814936670.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7002642154693604, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8769915699958801, + "num_tokens": 814974511.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6976078748703003, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.876953125, + "num_tokens": 815016025.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.785033941268921, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8558902740478516, + "num_tokens": 815055291.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5497024059295654, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8846821188926697, + "num_tokens": 815095654.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5861023664474487, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8916555643081665, + "num_tokens": 815134078.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4407962560653687, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8782610893249512, + "num_tokens": 815181319.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6611888408660889, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8789790868759155, + "num_tokens": 815219457.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6937536001205444, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8663002252578735, + "num_tokens": 815257031.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4701744318008423, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.8884762525558472, + "num_tokens": 815301884.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.515091896057129, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8661988973617554, + "num_tokens": 815346544.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.611979365348816, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8778070211410522, + "num_tokens": 815384474.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6124119758605957, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8684312105178833, + "num_tokens": 815425124.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6949684619903564, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8585435152053833, + "num_tokens": 815465081.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6423544883728027, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8670105338096619, + "num_tokens": 815505000.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.718695878982544, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8747552633285522, + "num_tokens": 815542274.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7249730825424194, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8839642405509949, + "num_tokens": 815573860.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6829776763916016, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8556149005889893, + "num_tokens": 815612518.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4740339517593384, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8749058246612549, + "num_tokens": 815658285.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.674583077430725, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8707493543624878, + "num_tokens": 815693740.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.632813572883606, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.884936511516571, + "num_tokens": 815731333.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4922038316726685, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.876338005065918, + "num_tokens": 815776989.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4613665342330933, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8792980909347534, + "num_tokens": 815824272.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.64969003200531, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8704689741134644, + "num_tokens": 815862959.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6454979181289673, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8799463510513306, + "num_tokens": 815906727.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.638679027557373, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8543729782104492, + "num_tokens": 815947191.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6684755086898804, + "learning_rate": 1e-06, + "loss": 0.3215, + "mean_token_accuracy": 0.89701247215271, + "num_tokens": 815983136.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.557381510734558, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8706761598587036, + "num_tokens": 816027877.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6659315824508667, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8636782765388489, + "num_tokens": 816069398.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.9305707216262817, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8780847787857056, + "num_tokens": 816102993.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5720371007919312, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.883550763130188, + "num_tokens": 816144420.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8677865266799927, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8692628145217896, + "num_tokens": 816175879.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8187532424926758, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8770068287849426, + "num_tokens": 816206067.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8161664009094238, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8661951422691345, + "num_tokens": 816240002.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5454455614089966, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8795727491378784, + "num_tokens": 816283989.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 3.7682273387908936, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8777892589569092, + "num_tokens": 816322628.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.573102355003357, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8688156008720398, + "num_tokens": 816363017.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6450761556625366, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8763324022293091, + "num_tokens": 816398056.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6034324169158936, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8842942714691162, + "num_tokens": 816435015.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5013253688812256, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.881659209728241, + "num_tokens": 816477800.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5243101119995117, + "learning_rate": 1e-06, + "loss": 0.3459, + "mean_token_accuracy": 0.888421356678009, + "num_tokens": 816516685.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8057981729507446, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8669811487197876, + "num_tokens": 816553451.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.784265398979187, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8752011656761169, + "num_tokens": 816583598.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6686317920684814, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8740137815475464, + "num_tokens": 816622605.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7612775564193726, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8703582286834717, + "num_tokens": 816658100.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7483022212982178, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8858196139335632, + "num_tokens": 816690973.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6278504133224487, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8820585012435913, + "num_tokens": 816730164.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5312081575393677, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8838709592819214, + "num_tokens": 816770212.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6364223957061768, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8862614631652832, + "num_tokens": 816804438.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5424853563308716, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8578940629959106, + "num_tokens": 816851126.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6079888343811035, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8703519105911255, + "num_tokens": 816891440.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5984519720077515, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8654917478561401, + "num_tokens": 816931762.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.776984691619873, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8689565658569336, + "num_tokens": 816964582.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.718885064125061, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.84773850440979, + "num_tokens": 817008610.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7361363172531128, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8734990358352661, + "num_tokens": 817044619.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5014209747314453, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8817267417907715, + "num_tokens": 817085688.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6990278959274292, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8815456628799438, + "num_tokens": 817119117.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6394448280334473, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8756781816482544, + "num_tokens": 817153477.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7175933122634888, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8804911971092224, + "num_tokens": 817187082.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8022276163101196, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8770876526832581, + "num_tokens": 817222826.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7820613384246826, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8761860132217407, + "num_tokens": 817259395.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 3.7131893634796143, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8647283911705017, + "num_tokens": 817296456.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8084003925323486, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8736969232559204, + "num_tokens": 817330838.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7425453662872314, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.864479124546051, + "num_tokens": 817371088.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.768396258354187, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8616429567337036, + "num_tokens": 817407060.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4978528022766113, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8724511861801147, + "num_tokens": 817449540.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6410598754882812, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8689796328544617, + "num_tokens": 817488620.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5028761625289917, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8703629970550537, + "num_tokens": 817531985.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5893036127090454, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8661686778068542, + "num_tokens": 817569076.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7779955863952637, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8550105094909668, + "num_tokens": 817600617.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6195846796035767, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8717435002326965, + "num_tokens": 817639759.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.75542414188385, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8717307448387146, + "num_tokens": 817677186.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6989705562591553, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8573728203773499, + "num_tokens": 817713329.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.615952968597412, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.873129665851593, + "num_tokens": 817752037.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7234224081039429, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8650896549224854, + "num_tokens": 817788529.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5968037843704224, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8741291761398315, + "num_tokens": 817828328.0, + "step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.687909483909607, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8728563785552979, + "num_tokens": 817868182.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.702822208404541, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8689420223236084, + "num_tokens": 817907701.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.484950304031372, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8871287703514099, + "num_tokens": 817949582.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6863497495651245, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8693187236785889, + "num_tokens": 817984830.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.580301284790039, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8858066201210022, + "num_tokens": 818022130.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5186669826507568, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8888055086135864, + "num_tokens": 818061414.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7079416513442993, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8605862855911255, + "num_tokens": 818097049.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.60326087474823, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8866486549377441, + "num_tokens": 818131229.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5820258855819702, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8625888824462891, + "num_tokens": 818172726.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6031383275985718, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8863707184791565, + "num_tokens": 818211243.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6682507991790771, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8693649172782898, + "num_tokens": 818247467.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6639999151229858, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8652080297470093, + "num_tokens": 818286164.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5395278930664062, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.873103141784668, + "num_tokens": 818326374.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6453009843826294, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8802959322929382, + "num_tokens": 818362857.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5471584796905518, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8673205971717834, + "num_tokens": 818409379.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6353033781051636, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8746355772018433, + "num_tokens": 818445819.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6787077188491821, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8836125135421753, + "num_tokens": 818478345.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6725142002105713, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8808029890060425, + "num_tokens": 818512210.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.572969675064087, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8800358772277832, + "num_tokens": 818551066.0, + "step": 21454 + }, + { + "epoch": 2.7292965271593945, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.564640760421753, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8826775550842285, + "num_tokens": 818590471.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6413154602050781, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8664640784263611, + "num_tokens": 818629628.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7587608098983765, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8523728847503662, + "num_tokens": 818674926.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5862140655517578, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8745695948600769, + "num_tokens": 818713612.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5328706502914429, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8733800649642944, + "num_tokens": 818755652.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6452064514160156, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8716776371002197, + "num_tokens": 818796195.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 16.84099769592285, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8675352334976196, + "num_tokens": 818832653.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.9050005674362183, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8672902584075928, + "num_tokens": 818866755.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.819980263710022, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8696401119232178, + "num_tokens": 818905601.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7303926944732666, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8737313151359558, + "num_tokens": 818942058.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.602822184562683, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8845583200454712, + "num_tokens": 818980233.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.537293791770935, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8699348568916321, + "num_tokens": 819022521.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7014224529266357, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8911710977554321, + "num_tokens": 819060289.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7792961597442627, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8686962127685547, + "num_tokens": 819095971.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6112712621688843, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8829342126846313, + "num_tokens": 819133922.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5557353496551514, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8769006729125977, + "num_tokens": 819176491.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6448748111724854, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.857089638710022, + "num_tokens": 819219115.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5592315196990967, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.871642529964447, + "num_tokens": 819263583.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7336158752441406, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8606919050216675, + "num_tokens": 819301210.0, + "step": 21473 + }, + { + "epoch": 2.731713522452614, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7070435285568237, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8619769811630249, + "num_tokens": 819337889.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6614902019500732, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8752596974372864, + "num_tokens": 819372559.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.63254976272583, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8705374002456665, + "num_tokens": 819410949.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.585876703262329, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8869218826293945, + "num_tokens": 819448541.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6129133701324463, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8647242784500122, + "num_tokens": 819488310.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5619699954986572, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8509172201156616, + "num_tokens": 819531182.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.555051326751709, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.868574857711792, + "num_tokens": 819573185.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.645591378211975, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8742491006851196, + "num_tokens": 819610197.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.624420166015625, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8710812330245972, + "num_tokens": 819648754.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.589548110961914, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8793182373046875, + "num_tokens": 819685886.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5275609493255615, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8709825873374939, + "num_tokens": 819728348.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5716043710708618, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8713109493255615, + "num_tokens": 819768311.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5438560247421265, + "learning_rate": 1e-06, + "loss": 0.3389, + "mean_token_accuracy": 0.892827570438385, + "num_tokens": 819808250.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.674691081047058, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8531733751296997, + "num_tokens": 819851121.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.54654061794281, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8811396360397339, + "num_tokens": 819889152.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5381641387939453, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8704529404640198, + "num_tokens": 819931907.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4527802467346191, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8717435002326965, + "num_tokens": 819979446.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6859747171401978, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8833010792732239, + "num_tokens": 820017767.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5336672067642212, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8819644451141357, + "num_tokens": 820057877.0, + "step": 21492 + }, + { + "epoch": 2.7341305177458337, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7725911140441895, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8720211982727051, + "num_tokens": 820094793.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5664393901824951, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8797621726989746, + "num_tokens": 820135289.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5737409591674805, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8756023645401001, + "num_tokens": 820180075.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7632901668548584, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8780527710914612, + "num_tokens": 820216813.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7731609344482422, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8812974691390991, + "num_tokens": 820250311.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.549277663230896, + "learning_rate": 1e-06, + "loss": 0.3393, + "mean_token_accuracy": 0.8919516801834106, + "num_tokens": 820289001.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.605570673942566, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8695545792579651, + "num_tokens": 820324708.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7123570442199707, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8744776844978333, + "num_tokens": 820361673.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6563667058944702, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.865251898765564, + "num_tokens": 820399875.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6312679052352905, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8773291707038879, + "num_tokens": 820437333.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7279438972473145, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.852097749710083, + "num_tokens": 820473912.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6505812406539917, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8742948770523071, + "num_tokens": 820511212.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.680910348892212, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8729323744773865, + "num_tokens": 820545738.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7395991086959839, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8779129981994629, + "num_tokens": 820580554.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5570265054702759, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.875939130783081, + "num_tokens": 820618971.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7039411067962646, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8787781596183777, + "num_tokens": 820658043.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5712276697158813, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8726072907447815, + "num_tokens": 820697317.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6542402505874634, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8808969259262085, + "num_tokens": 820731787.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6899235248565674, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.875311017036438, + "num_tokens": 820773448.0, + "step": 21511 + }, + { + "epoch": 2.7365475130390537, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4754940271377563, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8769729137420654, + "num_tokens": 820821146.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.590508222579956, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8863526582717896, + "num_tokens": 820861787.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5138028860092163, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8709903955459595, + "num_tokens": 820906990.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5924677848815918, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8807415962219238, + "num_tokens": 820944059.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6688268184661865, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8664889335632324, + "num_tokens": 820983264.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6483339071273804, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8813155889511108, + "num_tokens": 821019834.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6735646724700928, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8720701932907104, + "num_tokens": 821053568.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6413328647613525, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.87367844581604, + "num_tokens": 821089819.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6630414724349976, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8703380823135376, + "num_tokens": 821127330.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6917634010314941, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8769506216049194, + "num_tokens": 821163451.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.697581171989441, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8711774945259094, + "num_tokens": 821198719.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.528975248336792, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8689190149307251, + "num_tokens": 821243594.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6506755352020264, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.884850025177002, + "num_tokens": 821279602.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5939851999282837, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8860325813293457, + "num_tokens": 821322046.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.627322793006897, + "learning_rate": 1e-06, + "loss": 0.3401, + "mean_token_accuracy": 0.8856598138809204, + "num_tokens": 821357598.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7316428422927856, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8680396676063538, + "num_tokens": 821391315.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6235431432724, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8654237985610962, + "num_tokens": 821433868.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.538580060005188, + "learning_rate": 1e-06, + "loss": 0.341, + "mean_token_accuracy": 0.8897990584373474, + "num_tokens": 821477520.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8813788890838623, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8619573712348938, + "num_tokens": 821509705.0, + "step": 21530 + }, + { + "epoch": 2.7389645083322733, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.616160273551941, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8656955361366272, + "num_tokens": 821549228.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6347248554229736, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8688713312149048, + "num_tokens": 821586872.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7348732948303223, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8720414042472839, + "num_tokens": 821621699.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.523999571800232, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8782556653022766, + "num_tokens": 821663182.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5846691131591797, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8879320621490479, + "num_tokens": 821700127.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5973325967788696, + "learning_rate": 1e-06, + "loss": 0.3247, + "mean_token_accuracy": 0.8933899998664856, + "num_tokens": 821734664.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6754727363586426, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8741320371627808, + "num_tokens": 821770868.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.57291841506958, + "learning_rate": 1e-06, + "loss": 0.3383, + "mean_token_accuracy": 0.8901615142822266, + "num_tokens": 821812041.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5166150331497192, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8829405307769775, + "num_tokens": 821851268.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6285218000411987, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8763286471366882, + "num_tokens": 821886863.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6218860149383545, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8742474913597107, + "num_tokens": 821925215.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 2.7180862426757812, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8613979816436768, + "num_tokens": 821964519.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5833263397216797, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8756664395332336, + "num_tokens": 822001057.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6841294765472412, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8642175197601318, + "num_tokens": 822039307.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7882128953933716, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8831803202629089, + "num_tokens": 822073489.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6520694494247437, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8701220154762268, + "num_tokens": 822110224.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6409028768539429, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8692501783370972, + "num_tokens": 822148875.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6664535999298096, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8732399940490723, + "num_tokens": 822186758.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.625192642211914, + "learning_rate": 1e-06, + "loss": 0.3305, + "mean_token_accuracy": 0.8915770649909973, + "num_tokens": 822223329.0, + "step": 21549 + }, + { + "epoch": 2.741381503625493, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6367429494857788, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8756029009819031, + "num_tokens": 822259460.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7190525531768799, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8771442174911499, + "num_tokens": 822291429.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6978579759597778, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8854701519012451, + "num_tokens": 822325506.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8427197933197021, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.875146746635437, + "num_tokens": 822356868.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7390692234039307, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8662607073783875, + "num_tokens": 822391184.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6599844694137573, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8804808259010315, + "num_tokens": 822427040.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7111939191818237, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8768044114112854, + "num_tokens": 822460750.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7406855821609497, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8893104791641235, + "num_tokens": 822490585.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6137380599975586, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8671031594276428, + "num_tokens": 822529048.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7384121417999268, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8671765327453613, + "num_tokens": 822563915.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6360677480697632, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8734413981437683, + "num_tokens": 822600793.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6710774898529053, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8793428540229797, + "num_tokens": 822633586.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5289764404296875, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.880042552947998, + "num_tokens": 822671634.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6449750661849976, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8499542474746704, + "num_tokens": 822711438.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8188749551773071, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8721306324005127, + "num_tokens": 822743319.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7224290370941162, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8716106414794922, + "num_tokens": 822778130.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6761441230773926, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8661269545555115, + "num_tokens": 822813281.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7517462968826294, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8825531005859375, + "num_tokens": 822846421.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6753469705581665, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8809467554092407, + "num_tokens": 822881905.0, + "step": 21568 + }, + { + "epoch": 2.7437984989187125, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7018393278121948, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8814393281936646, + "num_tokens": 822914659.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7521716356277466, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8748399019241333, + "num_tokens": 822949476.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6347155570983887, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8776029348373413, + "num_tokens": 822986418.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.622549295425415, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8786546587944031, + "num_tokens": 823025532.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7083994150161743, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8741070628166199, + "num_tokens": 823062874.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5338034629821777, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8916937112808228, + "num_tokens": 823100977.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.688239336013794, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8621910214424133, + "num_tokens": 823135951.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7518291473388672, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8691405057907104, + "num_tokens": 823168524.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6534185409545898, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8706015348434448, + "num_tokens": 823207594.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6996229887008667, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8724849820137024, + "num_tokens": 823243520.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6061798334121704, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8702707886695862, + "num_tokens": 823282422.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6483283042907715, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8750150203704834, + "num_tokens": 823319654.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6426854133605957, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8762557506561279, + "num_tokens": 823353790.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6433452367782593, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8726354837417603, + "num_tokens": 823389057.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6313045024871826, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.873371958732605, + "num_tokens": 823430285.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7765789031982422, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8783866167068481, + "num_tokens": 823463216.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6679824590682983, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8639816641807556, + "num_tokens": 823503402.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4973621368408203, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8831797242164612, + "num_tokens": 823544129.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6058952808380127, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8778706789016724, + "num_tokens": 823581836.0, + "step": 21587 + }, + { + "epoch": 2.746215494211932, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7328884601593018, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8757340312004089, + "num_tokens": 823617343.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6048473119735718, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8753843307495117, + "num_tokens": 823658287.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5466465950012207, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8765937685966492, + "num_tokens": 823696282.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7052065134048462, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8700661659240723, + "num_tokens": 823733904.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.647058367729187, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8734824061393738, + "num_tokens": 823775484.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6822686195373535, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8851533532142639, + "num_tokens": 823811524.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6247667074203491, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8712337613105774, + "num_tokens": 823850054.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6572786569595337, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8799077272415161, + "num_tokens": 823889214.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6652350425720215, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8680925369262695, + "num_tokens": 823927947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.758641242980957, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8751906156539917, + "num_tokens": 823963771.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4349708557128906, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8942998647689819, + "num_tokens": 824008418.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7595925331115723, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.878092348575592, + "num_tokens": 824041736.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5180580615997314, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8746746182441711, + "num_tokens": 824086948.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6406391859054565, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8841047883033752, + "num_tokens": 824123170.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6899288892745972, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8793277740478516, + "num_tokens": 824158989.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6230381727218628, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8827451467514038, + "num_tokens": 824197723.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.738054633140564, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8817623853683472, + "num_tokens": 824232460.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6379865407943726, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8689762353897095, + "num_tokens": 824278638.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6877470016479492, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8837632536888123, + "num_tokens": 824312288.0, + "step": 21606 + }, + { + "epoch": 2.748632489505152, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6057145595550537, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8738778233528137, + "num_tokens": 824350112.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7743175029754639, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8757312893867493, + "num_tokens": 824384587.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5346990823745728, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8716937303543091, + "num_tokens": 824428726.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.567460060119629, + "learning_rate": 1e-06, + "loss": 0.3274, + "mean_token_accuracy": 0.8913947343826294, + "num_tokens": 824467960.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6325839757919312, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8845288753509521, + "num_tokens": 824504416.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7689327001571655, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8655756711959839, + "num_tokens": 824538068.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7209564447402954, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.87971431016922, + "num_tokens": 824575809.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5642441511154175, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8814588785171509, + "num_tokens": 824615423.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6679743528366089, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8770200610160828, + "num_tokens": 824651278.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6526964902877808, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8783948421478271, + "num_tokens": 824689674.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6006104946136475, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8686227798461914, + "num_tokens": 824727711.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.53396737575531, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.886527419090271, + "num_tokens": 824766959.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5113232135772705, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.879082441329956, + "num_tokens": 824812140.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5574496984481812, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8835313320159912, + "num_tokens": 824850879.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.650267481803894, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8832886815071106, + "num_tokens": 824887224.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.94465172290802, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8773630857467651, + "num_tokens": 824925284.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5461031198501587, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8685166835784912, + "num_tokens": 824969193.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.788101315498352, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8525861501693726, + "num_tokens": 825008053.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.648666501045227, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8660166263580322, + "num_tokens": 825046534.0, + "step": 21625 + }, + { + "epoch": 2.7510494847983717, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6805180311203003, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8747481107711792, + "num_tokens": 825081025.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7737187147140503, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8634472489356995, + "num_tokens": 825117084.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6602798700332642, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8781489729881287, + "num_tokens": 825157235.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5946462154388428, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.867254912853241, + "num_tokens": 825198555.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.690828800201416, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8741263151168823, + "num_tokens": 825238411.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5873053073883057, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8696082234382629, + "num_tokens": 825280674.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6827342510223389, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8662400245666504, + "num_tokens": 825319062.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4959790706634521, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8796945810317993, + "num_tokens": 825360358.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6298223733901978, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8648806214332581, + "num_tokens": 825403727.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6779953241348267, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.864214301109314, + "num_tokens": 825441798.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5449644327163696, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.882523238658905, + "num_tokens": 825481455.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6274282932281494, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8807621002197266, + "num_tokens": 825519125.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7043135166168213, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8814614415168762, + "num_tokens": 825555472.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6167478561401367, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8839580416679382, + "num_tokens": 825592107.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5374327898025513, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8648727536201477, + "num_tokens": 825635689.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.542837142944336, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8828279376029968, + "num_tokens": 825673261.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.590926170349121, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8860591650009155, + "num_tokens": 825715368.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5419285297393799, + "learning_rate": 1e-06, + "loss": 0.3501, + "mean_token_accuracy": 0.8861262798309326, + "num_tokens": 825756511.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6895378828048706, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8630826473236084, + "num_tokens": 825793768.0, + "step": 21644 + }, + { + "epoch": 2.7534664800915913, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8187609910964966, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8589307069778442, + "num_tokens": 825833587.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5896981954574585, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.868340790271759, + "num_tokens": 825873916.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5075416564941406, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8809492588043213, + "num_tokens": 825916373.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5989394187927246, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8624520301818848, + "num_tokens": 825955040.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6783839464187622, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8722842335700989, + "num_tokens": 825988254.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6609262228012085, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8771771192550659, + "num_tokens": 826025721.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5847362279891968, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.881393313407898, + "num_tokens": 826065868.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6749285459518433, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8795100450515747, + "num_tokens": 826104977.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8616161346435547, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8745285272598267, + "num_tokens": 826133824.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6402088403701782, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8730694055557251, + "num_tokens": 826172454.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5524407625198364, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.869329571723938, + "num_tokens": 826217530.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6449564695358276, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8707051277160645, + "num_tokens": 826257290.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6424610614776611, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8597635626792908, + "num_tokens": 826297166.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.523538589477539, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8796888589859009, + "num_tokens": 826341053.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.712868332862854, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8659064769744873, + "num_tokens": 826380368.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5190346240997314, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8861877918243408, + "num_tokens": 826421405.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7255102396011353, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.885196328163147, + "num_tokens": 826458352.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6385613679885864, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.878008246421814, + "num_tokens": 826495697.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.597090482711792, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8784838914871216, + "num_tokens": 826536474.0, + "step": 21663 + }, + { + "epoch": 2.755883475384811, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5379908084869385, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8816854953765869, + "num_tokens": 826577042.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5881415605545044, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8667842149734497, + "num_tokens": 826618238.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.696012020111084, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8764725923538208, + "num_tokens": 826655374.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5547784566879272, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8763573169708252, + "num_tokens": 826694894.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6624385118484497, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8723024725914001, + "num_tokens": 826730977.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6390626430511475, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8779182434082031, + "num_tokens": 826769105.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7425116300582886, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8776516914367676, + "num_tokens": 826810170.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6025359630584717, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8757612109184265, + "num_tokens": 826847239.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7601583003997803, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8724753260612488, + "num_tokens": 826881591.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6611013412475586, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8696091175079346, + "num_tokens": 826916129.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5755879878997803, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8816121816635132, + "num_tokens": 826952386.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5193623304367065, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8806746006011963, + "num_tokens": 826992343.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5831338167190552, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.880258321762085, + "num_tokens": 827033992.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6488603353500366, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8803694248199463, + "num_tokens": 827071228.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.514021396636963, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.890490710735321, + "num_tokens": 827110258.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6803549528121948, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8794618844985962, + "num_tokens": 827143949.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6512219905853271, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8807085752487183, + "num_tokens": 827178445.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.624483585357666, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8856416344642639, + "num_tokens": 827216933.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.9135578870773315, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8666418790817261, + "num_tokens": 827251581.0, + "step": 21682 + }, + { + "epoch": 2.758300470678031, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7412147521972656, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8748247623443604, + "num_tokens": 827284627.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7042924165725708, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8495098352432251, + "num_tokens": 827323080.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5988268852233887, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8645536303520203, + "num_tokens": 827363311.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5571452379226685, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8855988383293152, + "num_tokens": 827400546.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6264575719833374, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8805637359619141, + "num_tokens": 827436285.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.640467882156372, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8878171443939209, + "num_tokens": 827471462.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5699665546417236, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.864022433757782, + "num_tokens": 827515111.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.617711067199707, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8757907152175903, + "num_tokens": 827553624.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7466562986373901, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8807762265205383, + "num_tokens": 827589697.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5798945426940918, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8783153295516968, + "num_tokens": 827631805.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6457349061965942, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8657206892967224, + "num_tokens": 827669939.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6954801082611084, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8556492328643799, + "num_tokens": 827710114.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6972965002059937, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8809143304824829, + "num_tokens": 827748286.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7407252788543701, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8771979212760925, + "num_tokens": 827782587.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.595574140548706, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8648494482040405, + "num_tokens": 827825379.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.668595314025879, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8895650506019592, + "num_tokens": 827861614.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7332252264022827, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8680638074874878, + "num_tokens": 827902218.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7598567008972168, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8700802326202393, + "num_tokens": 827937837.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.724179744720459, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8748966455459595, + "num_tokens": 827972063.0, + "step": 21701 + }, + { + "epoch": 2.7607174659712506, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6098774671554565, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8831424713134766, + "num_tokens": 828013286.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6708698272705078, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8788254857063293, + "num_tokens": 828047606.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.786970615386963, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8697470426559448, + "num_tokens": 828084814.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6750965118408203, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8795357942581177, + "num_tokens": 828124693.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6945260763168335, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8606451749801636, + "num_tokens": 828165418.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6776554584503174, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8832394480705261, + "num_tokens": 828204633.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5239609479904175, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8829855918884277, + "num_tokens": 828244880.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6373271942138672, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8703608512878418, + "num_tokens": 828283551.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7791070938110352, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8711555004119873, + "num_tokens": 828320466.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7386095523834229, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8662107586860657, + "num_tokens": 828359021.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5464144945144653, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8759790658950806, + "num_tokens": 828399289.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.690355896949768, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8636713027954102, + "num_tokens": 828435460.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6646664142608643, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8598462343215942, + "num_tokens": 828472730.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5770326852798462, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8826050162315369, + "num_tokens": 828513688.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.660522222518921, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8659223318099976, + "num_tokens": 828553482.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5648329257965088, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8768482208251953, + "num_tokens": 828594122.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6104648113250732, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8776402473449707, + "num_tokens": 828632846.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6177912950515747, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8602321147918701, + "num_tokens": 828671880.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6527912616729736, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8825681209564209, + "num_tokens": 828711759.0, + "step": 21720 + }, + { + "epoch": 2.76313446126447, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.624326467514038, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.868984580039978, + "num_tokens": 828752126.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6492371559143066, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8728868961334229, + "num_tokens": 828790633.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7813506126403809, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8524094820022583, + "num_tokens": 828831553.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7287038564682007, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8859385251998901, + "num_tokens": 828865522.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7020702362060547, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8818615078926086, + "num_tokens": 828900142.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4723803997039795, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8835222125053406, + "num_tokens": 828943183.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6018187999725342, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8806290626525879, + "num_tokens": 828981113.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.61892569065094, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8715713024139404, + "num_tokens": 829019773.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.644079327583313, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8789547085762024, + "num_tokens": 829056449.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6876872777938843, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8580633997917175, + "num_tokens": 829094531.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6352534294128418, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8691589832305908, + "num_tokens": 829134442.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6641690731048584, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8756639957427979, + "num_tokens": 829174114.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5061348676681519, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8750625848770142, + "num_tokens": 829217718.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6820038557052612, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8773928880691528, + "num_tokens": 829254917.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6877073049545288, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8699284791946411, + "num_tokens": 829293343.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5117955207824707, + "learning_rate": 1e-06, + "loss": 0.3476, + "mean_token_accuracy": 0.8863705992698669, + "num_tokens": 829336581.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6980175971984863, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8667263984680176, + "num_tokens": 829374062.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6526951789855957, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8620584011077881, + "num_tokens": 829414539.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6710766553878784, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8731380701065063, + "num_tokens": 829454121.0, + "step": 21739 + }, + { + "epoch": 2.7655514565576897, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5672193765640259, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8700380921363831, + "num_tokens": 829495213.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7333115339279175, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8699270486831665, + "num_tokens": 829528690.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.489647626876831, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.866288423538208, + "num_tokens": 829575709.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5841912031173706, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8682832717895508, + "num_tokens": 829617030.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6575134992599487, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8778877258300781, + "num_tokens": 829654918.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.595569133758545, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8776378631591797, + "num_tokens": 829696645.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5821715593338013, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.879696786403656, + "num_tokens": 829735658.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5971354246139526, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.875906229019165, + "num_tokens": 829772385.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5466506481170654, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8746063709259033, + "num_tokens": 829811355.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7187618017196655, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.885764479637146, + "num_tokens": 829842074.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6462459564208984, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8764033317565918, + "num_tokens": 829877715.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.660685658454895, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8570283055305481, + "num_tokens": 829920950.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6278983354568481, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8764580488204956, + "num_tokens": 829960710.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8635424375534058, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8862883448600769, + "num_tokens": 829995538.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6140245199203491, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8708139657974243, + "num_tokens": 830037232.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7955045700073242, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8728858232498169, + "num_tokens": 830073683.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.66771399974823, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8859924077987671, + "num_tokens": 830107937.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7591997385025024, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8681395053863525, + "num_tokens": 830144044.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6590481996536255, + "learning_rate": 1e-06, + "loss": 0.3486, + "mean_token_accuracy": 0.884014904499054, + "num_tokens": 830185408.0, + "step": 21758 + }, + { + "epoch": 2.7679684518509093, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7924206256866455, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8893982172012329, + "num_tokens": 830221728.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6064728498458862, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8771389126777649, + "num_tokens": 830263993.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5347646474838257, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8679083585739136, + "num_tokens": 830305862.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6553727388381958, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8751777410507202, + "num_tokens": 830344342.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.683231234550476, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8727648258209229, + "num_tokens": 830382988.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6109397411346436, + "learning_rate": 1e-06, + "loss": 0.3341, + "mean_token_accuracy": 0.8898836374282837, + "num_tokens": 830420762.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5335625410079956, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8941020965576172, + "num_tokens": 830460053.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5142124891281128, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8882190585136414, + "num_tokens": 830504238.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5438636541366577, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8853811025619507, + "num_tokens": 830542998.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6003180742263794, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8696609735488892, + "num_tokens": 830582808.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6477304697036743, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8729352951049805, + "num_tokens": 830620958.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6308642625808716, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8771976232528687, + "num_tokens": 830658660.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5491960048675537, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.875545084476471, + "num_tokens": 830699519.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7828962802886963, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.885301947593689, + "num_tokens": 830731650.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6121803522109985, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.873753011226654, + "num_tokens": 830771761.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.688011646270752, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8632141351699829, + "num_tokens": 830816067.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5729175806045532, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8662898540496826, + "num_tokens": 830860562.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6328905820846558, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8757607936859131, + "num_tokens": 830896326.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6574000120162964, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8663017153739929, + "num_tokens": 830933757.0, + "step": 21777 + }, + { + "epoch": 2.7703854471441294, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6512483358383179, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8799049854278564, + "num_tokens": 830968957.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8001781702041626, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8520987033843994, + "num_tokens": 831006366.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7399927377700806, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8613501787185669, + "num_tokens": 831043478.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.685509204864502, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8630327582359314, + "num_tokens": 831082077.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5631321668624878, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8671240210533142, + "num_tokens": 831123573.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.550029993057251, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8805137872695923, + "num_tokens": 831165987.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6567610502243042, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8752950429916382, + "num_tokens": 831199404.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7781678438186646, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8592019081115723, + "num_tokens": 831232131.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.582061529159546, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8729050159454346, + "num_tokens": 831267842.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.687217354774475, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8716553449630737, + "num_tokens": 831302333.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.642948865890503, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8678721189498901, + "num_tokens": 831342019.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7531334161758423, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8787001371383667, + "num_tokens": 831376986.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6681076288223267, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.882418155670166, + "num_tokens": 831418003.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5403594970703125, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8746960163116455, + "num_tokens": 831461075.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6251929998397827, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8682841062545776, + "num_tokens": 831498512.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5593634843826294, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8698164224624634, + "num_tokens": 831539135.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5774024724960327, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8734013438224792, + "num_tokens": 831578736.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7008212804794312, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8823444843292236, + "num_tokens": 831619599.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8585768938064575, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8646283149719238, + "num_tokens": 831656487.0, + "step": 21796 + }, + { + "epoch": 2.772802442437349, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6178117990493774, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8702154755592346, + "num_tokens": 831696274.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6732707023620605, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8743123412132263, + "num_tokens": 831734230.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6832941770553589, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8866453170776367, + "num_tokens": 831772754.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6957348585128784, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8720556497573853, + "num_tokens": 831809692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7431776523590088, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8890984654426575, + "num_tokens": 831842078.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.729919672012329, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8784250020980835, + "num_tokens": 831875273.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.763893485069275, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8757724761962891, + "num_tokens": 831913118.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6520322561264038, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8871378898620605, + "num_tokens": 831950661.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.526784062385559, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8694748878479004, + "num_tokens": 831989506.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7607669830322266, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8680285215377808, + "num_tokens": 832027971.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8523017168045044, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8634018301963806, + "num_tokens": 832060955.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7752304077148438, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8812263011932373, + "num_tokens": 832097110.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6404322385787964, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8794028759002686, + "num_tokens": 832129478.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.9592748880386353, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8770552277565002, + "num_tokens": 832162549.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7309212684631348, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8795502185821533, + "num_tokens": 832201145.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6506643295288086, + "learning_rate": 1e-06, + "loss": 0.3397, + "mean_token_accuracy": 0.8888945579528809, + "num_tokens": 832236862.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6686317920684814, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8834295272827148, + "num_tokens": 832275292.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7015107870101929, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8688029646873474, + "num_tokens": 832311864.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7010166645050049, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.876846194267273, + "num_tokens": 832352047.0, + "step": 21815 + }, + { + "epoch": 2.7752194377305686, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8614312410354614, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.861030101776123, + "num_tokens": 832386608.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6248723268508911, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8819400668144226, + "num_tokens": 832426924.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6288244724273682, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8770904541015625, + "num_tokens": 832466619.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5524934530258179, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8848413228988647, + "num_tokens": 832505944.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6642035245895386, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8625727891921997, + "num_tokens": 832544462.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6361632347106934, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8600834608078003, + "num_tokens": 832586175.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.585239291191101, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8865367770195007, + "num_tokens": 832620818.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.772812843322754, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8804277181625366, + "num_tokens": 832652365.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.691606879234314, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8751807808876038, + "num_tokens": 832691332.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5832760334014893, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8705161809921265, + "num_tokens": 832736531.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.69187593460083, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8827400803565979, + "num_tokens": 832769730.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6772973537445068, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.877358078956604, + "num_tokens": 832806547.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7985819578170776, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8744055032730103, + "num_tokens": 832840661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.675622582435608, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8750956058502197, + "num_tokens": 832877400.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6389662027359009, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8821097016334534, + "num_tokens": 832913454.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.452991008758545, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8747329115867615, + "num_tokens": 832958261.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6243354082107544, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8674190044403076, + "num_tokens": 832996045.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6432892084121704, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8697409629821777, + "num_tokens": 833037022.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5995635986328125, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.873725175857544, + "num_tokens": 833078987.0, + "step": 21834 + }, + { + "epoch": 2.7776364330237886, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7995010614395142, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.870567262172699, + "num_tokens": 833110017.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5280078649520874, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8781760931015015, + "num_tokens": 833154045.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.582793951034546, + "learning_rate": 1e-06, + "loss": 0.3345, + "mean_token_accuracy": 0.890177309513092, + "num_tokens": 833192530.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6916207075119019, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8690601587295532, + "num_tokens": 833227498.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5282273292541504, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8860138058662415, + "num_tokens": 833265792.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.4878363609313965, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8711007237434387, + "num_tokens": 833310473.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6601508855819702, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8724520802497864, + "num_tokens": 833351286.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.673941731452942, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8662121891975403, + "num_tokens": 833391388.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6672135591506958, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8766871690750122, + "num_tokens": 833428338.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7192509174346924, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8702921867370605, + "num_tokens": 833462704.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6865925788879395, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.865429162979126, + "num_tokens": 833500651.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6171903610229492, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8836725354194641, + "num_tokens": 833534970.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5267481803894043, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8756552934646606, + "num_tokens": 833574138.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6790028810501099, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8667837977409363, + "num_tokens": 833615535.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7038975954055786, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8777625560760498, + "num_tokens": 833651198.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8050886392593384, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8675085306167603, + "num_tokens": 833683303.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5294551849365234, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8771227598190308, + "num_tokens": 833729397.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6035746335983276, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8785635232925415, + "num_tokens": 833766728.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.734878659248352, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8674383759498596, + "num_tokens": 833802118.0, + "step": 21853 + }, + { + "epoch": 2.780053428317008, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7943062782287598, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8602275848388672, + "num_tokens": 833835948.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.666155457496643, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.882367730140686, + "num_tokens": 833870229.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.66932213306427, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8842653632164001, + "num_tokens": 833906562.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6568920612335205, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8755978345870972, + "num_tokens": 833942894.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5892715454101562, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8711671829223633, + "num_tokens": 833983927.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7344565391540527, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8756064176559448, + "num_tokens": 834016813.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.638775110244751, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8902930021286011, + "num_tokens": 834054137.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7441061735153198, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8693661093711853, + "num_tokens": 834090151.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5532399415969849, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8765794038772583, + "num_tokens": 834132591.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5948511362075806, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8632578253746033, + "num_tokens": 834173841.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.812503695487976, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8633596301078796, + "num_tokens": 834205904.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.293776273727417, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8832108974456787, + "num_tokens": 834238283.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7737985849380493, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.877326488494873, + "num_tokens": 834271747.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7572684288024902, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8693339228630066, + "num_tokens": 834308353.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6414169073104858, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8798978328704834, + "num_tokens": 834346473.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7225233316421509, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8747451305389404, + "num_tokens": 834380938.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7805681228637695, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8753641843795776, + "num_tokens": 834411946.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7275704145431519, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8769723773002625, + "num_tokens": 834446923.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6267597675323486, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8782927989959717, + "num_tokens": 834483712.0, + "step": 21872 + }, + { + "epoch": 2.782470423610228, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.635948896408081, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8714435696601868, + "num_tokens": 834523113.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7799439430236816, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8672013282775879, + "num_tokens": 834554904.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7063840627670288, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8854563236236572, + "num_tokens": 834590538.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7038049697875977, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8720234036445618, + "num_tokens": 834626024.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5093780755996704, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.880699634552002, + "num_tokens": 834666500.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.30731463432312, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.875778079032898, + "num_tokens": 834701377.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5555092096328735, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8675113320350647, + "num_tokens": 834742941.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6932909488677979, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8713833093643188, + "num_tokens": 834776789.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6691433191299438, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8697377443313599, + "num_tokens": 834813707.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6362457275390625, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8634420037269592, + "num_tokens": 834858519.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.46291184425354, + "learning_rate": 1e-06, + "loss": 0.3267, + "mean_token_accuracy": 0.8941653966903687, + "num_tokens": 834900122.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7880631685256958, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8589122891426086, + "num_tokens": 834940399.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6517693996429443, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8601174354553223, + "num_tokens": 834981253.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5655059814453125, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.891470193862915, + "num_tokens": 835018722.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5904794931411743, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8486477136611938, + "num_tokens": 835062154.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6238350868225098, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8754991292953491, + "num_tokens": 835101924.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7662155628204346, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8740866780281067, + "num_tokens": 835133009.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.569907307624817, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8658403158187866, + "num_tokens": 835177316.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7099727392196655, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8642734289169312, + "num_tokens": 835214853.0, + "step": 21891 + }, + { + "epoch": 2.7848874189034474, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6537636518478394, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8744165897369385, + "num_tokens": 835249178.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.69585382938385, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8868504166603088, + "num_tokens": 835280559.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7760502099990845, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8720953464508057, + "num_tokens": 835317892.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6660183668136597, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8789479732513428, + "num_tokens": 835351699.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5524266958236694, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8717455267906189, + "num_tokens": 835393887.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.567021369934082, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8692083358764648, + "num_tokens": 835435551.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6225388050079346, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8859060406684875, + "num_tokens": 835474046.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4866551160812378, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8736147880554199, + "num_tokens": 835515761.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5737426280975342, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8815147280693054, + "num_tokens": 835554332.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.654715895652771, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8819605112075806, + "num_tokens": 835592583.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6945003271102905, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.867059051990509, + "num_tokens": 835632271.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8467633724212646, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8625366687774658, + "num_tokens": 835665480.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5932495594024658, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8743793368339539, + "num_tokens": 835707604.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5150576829910278, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8873652815818787, + "num_tokens": 835748248.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5818310976028442, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8776370286941528, + "num_tokens": 835788674.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6192113161087036, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8840795755386353, + "num_tokens": 835825776.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.80552339553833, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8475112915039062, + "num_tokens": 835864124.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6753438711166382, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.8920130729675293, + "num_tokens": 835904783.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7984683513641357, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8728800415992737, + "num_tokens": 835938609.0, + "step": 21910 + }, + { + "epoch": 2.787304414196667, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5844881534576416, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8801947832107544, + "num_tokens": 835979508.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.686423897743225, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8755871057510376, + "num_tokens": 836017548.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7772423028945923, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8740196228027344, + "num_tokens": 836055512.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6405199766159058, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.860127329826355, + "num_tokens": 836093986.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.620926856994629, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8838411569595337, + "num_tokens": 836130449.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6176737546920776, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8740863800048828, + "num_tokens": 836172008.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6850383281707764, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.885999321937561, + "num_tokens": 836204336.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6099047660827637, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8783424496650696, + "num_tokens": 836238649.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7524949312210083, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8705803155899048, + "num_tokens": 836277036.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6167476177215576, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8808237910270691, + "num_tokens": 836315219.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6155391931533813, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8645884394645691, + "num_tokens": 836357297.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7801294326782227, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8707724809646606, + "num_tokens": 836390032.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4654179811477661, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8840140700340271, + "num_tokens": 836432260.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.713403344154358, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8789472579956055, + "num_tokens": 836470976.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5845844745635986, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8806055784225464, + "num_tokens": 836508951.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.484302282333374, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8767116069793701, + "num_tokens": 836553621.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.690045952796936, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8586270809173584, + "num_tokens": 836594133.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.644091248512268, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.883510947227478, + "num_tokens": 836628582.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6204888820648193, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.882457971572876, + "num_tokens": 836665215.0, + "step": 21929 + }, + { + "epoch": 2.7897214094898866, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.833449363708496, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8727668523788452, + "num_tokens": 836694155.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8312972784042358, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8777492046356201, + "num_tokens": 836732367.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.670992136001587, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8718771934509277, + "num_tokens": 836769911.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7712911367416382, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8805159330368042, + "num_tokens": 836804330.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7174946069717407, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8815553188323975, + "num_tokens": 836835695.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8861807584762573, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8540560007095337, + "num_tokens": 836866119.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.738736867904663, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8752681016921997, + "num_tokens": 836898450.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.7070403099060059, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8561161756515503, + "num_tokens": 836939757.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6776607036590576, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8771592974662781, + "num_tokens": 836978219.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.8105417490005493, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8681710958480835, + "num_tokens": 837013355.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6561380624771118, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8770656585693359, + "num_tokens": 837050745.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6352410316467285, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8739215135574341, + "num_tokens": 837084122.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.5603392124176025, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.873611330986023, + "num_tokens": 837123448.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6739752292633057, + "learning_rate": 1e-06, + "loss": 0.3498, + "mean_token_accuracy": 0.884941041469574, + "num_tokens": 837155007.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.793993592262268, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8758966326713562, + "num_tokens": 837185770.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.619332194328308, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8665833473205566, + "num_tokens": 837227101.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6432256698608398, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8847131729125977, + "num_tokens": 837262749.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.6211482286453247, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8749729990959167, + "num_tokens": 837300552.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6238712072372437, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8751238584518433, + "num_tokens": 837339758.0, + "step": 21948 + }, + { + "epoch": 2.7921384047831066, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.5933860540390015, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8778440952301025, + "num_tokens": 837380473.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "ewc_loss": 2.9206275939941406e-05, + "grad_norm": 1.563683271408081, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8760510683059692, + "num_tokens": 837418244.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.5830315351486206, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8652751445770264, + "num_tokens": 837458147.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.4670542478561401, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8843970894813538, + "num_tokens": 837501821.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8105716705322266, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8821946382522583, + "num_tokens": 837536407.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6598314046859741, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8720201849937439, + "num_tokens": 837571647.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7319482564926147, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8661943674087524, + "num_tokens": 837609071.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.592185378074646, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8755779266357422, + "num_tokens": 837650169.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8111017942428589, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8809491991996765, + "num_tokens": 837680854.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6590728759765625, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8797187805175781, + "num_tokens": 837715704.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5628526210784912, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8643060922622681, + "num_tokens": 837754211.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7265214920043945, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8723073601722717, + "num_tokens": 837789833.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5484933853149414, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.85565185546875, + "num_tokens": 837836248.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5527344942092896, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.867432713508606, + "num_tokens": 837880066.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5979504585266113, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8761523962020874, + "num_tokens": 837917604.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8958985805511475, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8706470727920532, + "num_tokens": 837949802.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.850750207901001, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8641754388809204, + "num_tokens": 837984486.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6574128866195679, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8752990961074829, + "num_tokens": 838024577.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6305677890777588, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8691281080245972, + "num_tokens": 838063246.0, + "step": 21967 + }, + { + "epoch": 2.794555400076326, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.71385657787323, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8696370124816895, + "num_tokens": 838101802.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9171562194824219, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8592754602432251, + "num_tokens": 838142551.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.776581883430481, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8840011358261108, + "num_tokens": 838178954.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6613909006118774, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8662399649620056, + "num_tokens": 838218443.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5392568111419678, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8763880729675293, + "num_tokens": 838262796.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.5253714323043823, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8847422003746033, + "num_tokens": 838300948.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6186102628707886, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8785854578018188, + "num_tokens": 838336279.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.5480660200119019, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8572048544883728, + "num_tokens": 838383467.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 2.9325485229492188e-05, + "grad_norm": 1.6306719779968262, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8785163164138794, + "num_tokens": 838418993.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5469884872436523, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8688938021659851, + "num_tokens": 838460394.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6887911558151245, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8761323690414429, + "num_tokens": 838497312.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.590657114982605, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8711830973625183, + "num_tokens": 838539337.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.790952444076538, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.881566047668457, + "num_tokens": 838570918.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6634550094604492, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8740383386611938, + "num_tokens": 838610490.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6306205987930298, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8737545013427734, + "num_tokens": 838647226.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5412667989730835, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8660780787467957, + "num_tokens": 838688156.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5231350660324097, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8760942220687866, + "num_tokens": 838732240.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5774112939834595, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8574967980384827, + "num_tokens": 838777400.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5846569538116455, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8747626543045044, + "num_tokens": 838818813.0, + "step": 21986 + }, + { + "epoch": 2.796972395369546, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6903725862503052, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8850111365318298, + "num_tokens": 838854296.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8038369417190552, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8461971282958984, + "num_tokens": 838891704.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7294323444366455, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8895823359489441, + "num_tokens": 838922382.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6982431411743164, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8654420971870422, + "num_tokens": 838957805.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.614043951034546, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8724256157875061, + "num_tokens": 838995811.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6947364807128906, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8710892200469971, + "num_tokens": 839029914.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5725380182266235, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8815211653709412, + "num_tokens": 839068870.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8138970136642456, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8797216415405273, + "num_tokens": 839099561.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.747514009475708, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8699682950973511, + "num_tokens": 839138029.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5882920026779175, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8746019601821899, + "num_tokens": 839179621.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.520316481590271, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8788257837295532, + "num_tokens": 839219838.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5058599710464478, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8883501291275024, + "num_tokens": 839259869.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.711443305015564, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8724817037582397, + "num_tokens": 839296179.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6673072576522827, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.868000864982605, + "num_tokens": 839333535.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6092896461486816, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8769034147262573, + "num_tokens": 839372167.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5993202924728394, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.879331111907959, + "num_tokens": 839408436.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6943702697753906, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8718534708023071, + "num_tokens": 839444453.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6242306232452393, + "learning_rate": 1e-06, + "loss": 0.3204, + "mean_token_accuracy": 0.8951761722564697, + "num_tokens": 839480775.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8126760721206665, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8615567684173584, + "num_tokens": 839516578.0, + "step": 22005 + }, + { + "epoch": 2.799389390662766, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5996021032333374, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8799881339073181, + "num_tokens": 839555669.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.609458088874817, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8887794613838196, + "num_tokens": 839592386.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6130833625793457, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8808038234710693, + "num_tokens": 839632363.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.834801197052002, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8547186851501465, + "num_tokens": 839668182.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6282423734664917, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8798165917396545, + "num_tokens": 839706436.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6724128723144531, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8832398653030396, + "num_tokens": 839741284.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9121581315994263, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8587378263473511, + "num_tokens": 839774257.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8238798379898071, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8697139620780945, + "num_tokens": 839810323.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.70982825756073, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.85720294713974, + "num_tokens": 839847972.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5367339849472046, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8719886541366577, + "num_tokens": 839890610.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5957227945327759, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8800036907196045, + "num_tokens": 839931694.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6837080717086792, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8840004205703735, + "num_tokens": 839964388.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.741681456565857, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8678119778633118, + "num_tokens": 840001560.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4754174947738647, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8770874738693237, + "num_tokens": 840043607.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5367546081542969, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8733359575271606, + "num_tokens": 840085336.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7059308290481567, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8767356872558594, + "num_tokens": 840121610.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6092463731765747, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8792347311973572, + "num_tokens": 840158982.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.487906575202942, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8738487958908081, + "num_tokens": 840203197.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7486984729766846, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8614253401756287, + "num_tokens": 840241773.0, + "step": 22024 + }, + { + "epoch": 2.8018063859559854, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5123372077941895, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8830467462539673, + "num_tokens": 840284054.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5952513217926025, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8746060132980347, + "num_tokens": 840327082.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6340686082839966, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8749077320098877, + "num_tokens": 840364191.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.605434775352478, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8715404272079468, + "num_tokens": 840403833.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6566457748413086, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8720722198486328, + "num_tokens": 840443751.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6978611946105957, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.869393527507782, + "num_tokens": 840479004.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.674277901649475, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8798925876617432, + "num_tokens": 840516827.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7540574073791504, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8690636157989502, + "num_tokens": 840553810.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.599955677986145, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8746606111526489, + "num_tokens": 840593455.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.510406494140625, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8920949697494507, + "num_tokens": 840632630.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6761114597320557, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8723651766777039, + "num_tokens": 840668985.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5624443292617798, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8705498576164246, + "num_tokens": 840710289.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6130064725875854, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8815406560897827, + "num_tokens": 840747534.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5736680030822754, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8851350545883179, + "num_tokens": 840788299.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6800179481506348, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8867006301879883, + "num_tokens": 840822591.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8558688163757324, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8659523129463196, + "num_tokens": 840852174.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5322327613830566, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8956466913223267, + "num_tokens": 840889739.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5452734231948853, + "learning_rate": 1e-06, + "loss": 0.3351, + "mean_token_accuracy": 0.8946525454521179, + "num_tokens": 840929669.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6123939752578735, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8771361112594604, + "num_tokens": 840971377.0, + "step": 22043 + }, + { + "epoch": 2.804223381249205, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.778921365737915, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8778969645500183, + "num_tokens": 841005868.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.707643985748291, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8836339712142944, + "num_tokens": 841039393.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7479400634765625, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8776453733444214, + "num_tokens": 841076183.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.698468565940857, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8667517304420471, + "num_tokens": 841113523.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5847156047821045, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8749887943267822, + "num_tokens": 841153808.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8645325899124146, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.873323380947113, + "num_tokens": 841184252.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7433043718338013, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8651003241539001, + "num_tokens": 841219140.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.555238127708435, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8836624622344971, + "num_tokens": 841258935.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8110346794128418, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8653181195259094, + "num_tokens": 841296757.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6515452861785889, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8733211755752563, + "num_tokens": 841337894.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 3.789013147354126, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8613810539245605, + "num_tokens": 841373995.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.684723973274231, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8685166835784912, + "num_tokens": 841410163.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.760521650314331, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.8831382989883423, + "num_tokens": 841442275.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5766297578811646, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8788034319877625, + "num_tokens": 841483783.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.618065595626831, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8737286925315857, + "num_tokens": 841524251.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.637670636177063, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.875594973564148, + "num_tokens": 841568239.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6588294506072998, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8799039125442505, + "num_tokens": 841609432.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5303393602371216, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8749526143074036, + "num_tokens": 841652448.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.416330337524414, + "learning_rate": 1e-06, + "loss": 0.3229, + "mean_token_accuracy": 0.8974007368087769, + "num_tokens": 841694349.0, + "step": 22062 + }, + { + "epoch": 2.8066403765424246, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5345667600631714, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8848004341125488, + "num_tokens": 841738138.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6602617502212524, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8806939125061035, + "num_tokens": 841776800.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5126044750213623, + "learning_rate": 1e-06, + "loss": 0.3407, + "mean_token_accuracy": 0.8862543106079102, + "num_tokens": 841813976.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6315828561782837, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8569192886352539, + "num_tokens": 841855922.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6000900268554688, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8734806776046753, + "num_tokens": 841894173.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.593291163444519, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8790570497512817, + "num_tokens": 841931777.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.817880392074585, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8608181476593018, + "num_tokens": 841968186.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7193235158920288, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8741825819015503, + "num_tokens": 842007875.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7087795734405518, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.859548807144165, + "num_tokens": 842046463.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.670188069343567, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8702763915061951, + "num_tokens": 842085061.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6334772109985352, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8881546258926392, + "num_tokens": 842122017.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.71467924118042, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8769612908363342, + "num_tokens": 842159565.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6904656887054443, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8673585653305054, + "num_tokens": 842199103.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7782976627349854, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8894122838973999, + "num_tokens": 842233431.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7204475402832031, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8814969658851624, + "num_tokens": 842272398.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5082309246063232, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8751932978630066, + "num_tokens": 842319817.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7809593677520752, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8709196448326111, + "num_tokens": 842352948.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6229227781295776, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.873267650604248, + "num_tokens": 842396990.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8277772665023804, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.86967933177948, + "num_tokens": 842434681.0, + "step": 22081 + }, + { + "epoch": 2.809057371835644, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7853617668151855, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8769827485084534, + "num_tokens": 842468750.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6600896120071411, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8773430585861206, + "num_tokens": 842502695.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.74609375, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8732321858406067, + "num_tokens": 842536573.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.75644052028656, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8650222420692444, + "num_tokens": 842570753.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8184987306594849, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8799375295639038, + "num_tokens": 842606871.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6769402027130127, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8751935958862305, + "num_tokens": 842645379.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.66785728931427, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8524644374847412, + "num_tokens": 842685721.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.614823341369629, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8808397650718689, + "num_tokens": 842724743.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 3.7784829139709473, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8620389699935913, + "num_tokens": 842761534.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7020245790481567, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8720795512199402, + "num_tokens": 842804887.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.689447045326233, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8639642000198364, + "num_tokens": 842842756.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6261035203933716, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8817365765571594, + "num_tokens": 842883170.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8939123153686523, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8711686730384827, + "num_tokens": 842913557.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7737302780151367, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8675050139427185, + "num_tokens": 842949400.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8614085912704468, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8581802845001221, + "num_tokens": 842987536.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7791383266448975, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8747145533561707, + "num_tokens": 843025621.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7201616764068604, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8805294036865234, + "num_tokens": 843062572.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8151942491531372, + "learning_rate": 1e-06, + "loss": 0.3647, + "mean_token_accuracy": 0.8821903467178345, + "num_tokens": 843094966.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6578253507614136, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8646321296691895, + "num_tokens": 843133193.0, + "step": 22100 + }, + { + "epoch": 2.811474367128864, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6224472522735596, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.878557026386261, + "num_tokens": 843172476.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7839562892913818, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8644376993179321, + "num_tokens": 843208680.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7983826398849487, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8914637565612793, + "num_tokens": 843244776.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6260508298873901, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8717435002326965, + "num_tokens": 843282585.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6681653261184692, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8598164319992065, + "num_tokens": 843319036.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.713176965713501, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8773409128189087, + "num_tokens": 843352573.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.540710210800171, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8765529990196228, + "num_tokens": 843392667.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5325675010681152, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8771851062774658, + "num_tokens": 843436273.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5608150959014893, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8638449907302856, + "num_tokens": 843480576.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5770223140716553, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8724308609962463, + "num_tokens": 843522587.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8775596618652344, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.871843159198761, + "num_tokens": 843554730.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7241144180297852, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.865693986415863, + "num_tokens": 843592961.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6665576696395874, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8745655417442322, + "num_tokens": 843629226.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.571414589881897, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8590839505195618, + "num_tokens": 843674809.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8435909748077393, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8593661785125732, + "num_tokens": 843706863.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.516422986984253, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8797208070755005, + "num_tokens": 843747915.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.626744031906128, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8755236268043518, + "num_tokens": 843784982.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8799816370010376, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8800002336502075, + "num_tokens": 843814458.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7218804359436035, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.850074291229248, + "num_tokens": 843851107.0, + "step": 22119 + }, + { + "epoch": 2.813891362422084, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5644466876983643, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8789675235748291, + "num_tokens": 843892036.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5099090337753296, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8751354217529297, + "num_tokens": 843936738.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.611502766609192, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8717212080955505, + "num_tokens": 843977557.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6709898710250854, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8794581890106201, + "num_tokens": 844013297.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6275075674057007, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8735586404800415, + "num_tokens": 844051816.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.495640516281128, + "learning_rate": 1e-06, + "loss": 0.3369, + "mean_token_accuracy": 0.8905633687973022, + "num_tokens": 844090004.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5742162466049194, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8784637451171875, + "num_tokens": 844130217.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6691267490386963, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8757731914520264, + "num_tokens": 844166333.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6678242683410645, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8705135583877563, + "num_tokens": 844200365.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.583856463432312, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8866026401519775, + "num_tokens": 844241011.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.639580249786377, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8691118955612183, + "num_tokens": 844281658.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6744388341903687, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.875237226486206, + "num_tokens": 844317026.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.656990885734558, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.889142632484436, + "num_tokens": 844353584.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.677311658859253, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8759240508079529, + "num_tokens": 844388699.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6161150932312012, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.881347119808197, + "num_tokens": 844426585.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6762486696243286, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8626621961593628, + "num_tokens": 844465850.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.573952555656433, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.8887040615081787, + "num_tokens": 844504542.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.803891658782959, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8520309329032898, + "num_tokens": 844539804.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7093558311462402, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8603177070617676, + "num_tokens": 844575063.0, + "step": 22138 + }, + { + "epoch": 2.8163083577153034, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7601412534713745, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8703284859657288, + "num_tokens": 844610560.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5611557960510254, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8862079381942749, + "num_tokens": 844646989.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6227136850357056, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.878620982170105, + "num_tokens": 844686575.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6635031700134277, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8778727650642395, + "num_tokens": 844724377.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5939761400222778, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8775123357772827, + "num_tokens": 844762568.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5457578897476196, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8797003626823425, + "num_tokens": 844802855.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5410617589950562, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8757150173187256, + "num_tokens": 844846216.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.626473307609558, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8872441649436951, + "num_tokens": 844881222.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6463227272033691, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8803764581680298, + "num_tokens": 844917433.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7129757404327393, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8607938289642334, + "num_tokens": 844952523.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6926501989364624, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8780255913734436, + "num_tokens": 844985939.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6404372453689575, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8859327435493469, + "num_tokens": 845020373.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5508389472961426, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8705930113792419, + "num_tokens": 845060919.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.564324975013733, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8903869986534119, + "num_tokens": 845097664.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5792570114135742, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8858810663223267, + "num_tokens": 845131820.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6500517129898071, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8772412538528442, + "num_tokens": 845168504.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7179306745529175, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8846563100814819, + "num_tokens": 845201661.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7326363325119019, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8701415061950684, + "num_tokens": 845235769.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.724113941192627, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.881049633026123, + "num_tokens": 845267448.0, + "step": 22157 + }, + { + "epoch": 2.818725353008523, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7803285121917725, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8627257347106934, + "num_tokens": 845304848.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5520681142807007, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.863724946975708, + "num_tokens": 845349006.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6492961645126343, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8771536350250244, + "num_tokens": 845385936.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7903560400009155, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8663063049316406, + "num_tokens": 845424359.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6583980321884155, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8698407411575317, + "num_tokens": 845463338.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.613898515701294, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8758710026741028, + "num_tokens": 845501029.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6771540641784668, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8561245203018188, + "num_tokens": 845541742.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6729644536972046, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8771272897720337, + "num_tokens": 845578155.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.538539171218872, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8808046579360962, + "num_tokens": 845615377.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5592867136001587, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8837646842002869, + "num_tokens": 845656081.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.650085210800171, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8741673231124878, + "num_tokens": 845688652.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6271556615829468, + "learning_rate": 1e-06, + "loss": 0.3455, + "mean_token_accuracy": 0.8866415023803711, + "num_tokens": 845731195.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.685045599937439, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8885530829429626, + "num_tokens": 845763168.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.521562933921814, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8708864450454712, + "num_tokens": 845808316.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6539878845214844, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8671978712081909, + "num_tokens": 845847426.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6012946367263794, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8818488121032715, + "num_tokens": 845884469.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6745558977127075, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8716747760772705, + "num_tokens": 845923739.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4724061489105225, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8854501247406006, + "num_tokens": 845968550.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.80362069606781, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8721364736557007, + "num_tokens": 846003705.0, + "step": 22176 + }, + { + "epoch": 2.821142348301743, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6637464761734009, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8697333931922913, + "num_tokens": 846044341.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7960911989212036, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8800665736198425, + "num_tokens": 846076102.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6076104640960693, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8778898119926453, + "num_tokens": 846116268.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.593745470046997, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.879680871963501, + "num_tokens": 846155854.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5166778564453125, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8880664706230164, + "num_tokens": 846198271.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7440388202667236, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8736487627029419, + "num_tokens": 846236433.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6997044086456299, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8834713697433472, + "num_tokens": 846274561.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8474829196929932, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8746380805969238, + "num_tokens": 846308910.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.812276005744934, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.876896858215332, + "num_tokens": 846339432.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7793283462524414, + "learning_rate": 1e-06, + "loss": 0.3379, + "mean_token_accuracy": 0.8894222974777222, + "num_tokens": 846377681.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7927064895629883, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8698523044586182, + "num_tokens": 846411733.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5839284658432007, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8820345401763916, + "num_tokens": 846449208.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5153377056121826, + "learning_rate": 1e-06, + "loss": 0.3399, + "mean_token_accuracy": 0.8884903788566589, + "num_tokens": 846488070.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6363470554351807, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8784375786781311, + "num_tokens": 846527289.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6330602169036865, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.888119637966156, + "num_tokens": 846567469.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6272013187408447, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8776543140411377, + "num_tokens": 846604960.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7795789241790771, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8752624988555908, + "num_tokens": 846638323.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4801815748214722, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8775576949119568, + "num_tokens": 846679084.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6768484115600586, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.861430823802948, + "num_tokens": 846718722.0, + "step": 22195 + }, + { + "epoch": 2.8235593435949626, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5260157585144043, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8763052821159363, + "num_tokens": 846759124.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5471867322921753, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8713521957397461, + "num_tokens": 846799752.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9192936420440674, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8698369264602661, + "num_tokens": 846830045.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6063637733459473, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8604612946510315, + "num_tokens": 846869093.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.734782338142395, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8789883255958557, + "num_tokens": 846906970.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6422089338302612, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8821456432342529, + "num_tokens": 846943922.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7136286497116089, + "learning_rate": 1e-06, + "loss": 0.3321, + "mean_token_accuracy": 0.8915319442749023, + "num_tokens": 846975681.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5079965591430664, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8888247013092041, + "num_tokens": 847013308.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5939650535583496, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8848938941955566, + "num_tokens": 847053685.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6648296117782593, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8825664520263672, + "num_tokens": 847093757.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7622063159942627, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8741742968559265, + "num_tokens": 847127899.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6472585201263428, + "learning_rate": 1e-06, + "loss": 0.3408, + "mean_token_accuracy": 0.8889925479888916, + "num_tokens": 847162625.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6113065481185913, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.879163384437561, + "num_tokens": 847202083.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6182634830474854, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8648264408111572, + "num_tokens": 847242720.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7022087574005127, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8774880170822144, + "num_tokens": 847280250.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5077869892120361, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8767637014389038, + "num_tokens": 847322891.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6450427770614624, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8755452036857605, + "num_tokens": 847362134.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5491504669189453, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8758755922317505, + "num_tokens": 847405271.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.593994140625, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8723661303520203, + "num_tokens": 847447156.0, + "step": 22214 + }, + { + "epoch": 2.8259763388881822, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5073193311691284, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8770396113395691, + "num_tokens": 847486868.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7326253652572632, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8679815530776978, + "num_tokens": 847521453.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4894925355911255, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8835359811782837, + "num_tokens": 847562873.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7021327018737793, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8785836100578308, + "num_tokens": 847600312.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7133694887161255, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8687810897827148, + "num_tokens": 847639800.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.449574589729309, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8842518329620361, + "num_tokens": 847683071.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6774520874023438, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8785051107406616, + "num_tokens": 847720783.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.631640911102295, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8784681558609009, + "num_tokens": 847755567.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6672303676605225, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8901021480560303, + "num_tokens": 847791927.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5252958536148071, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8775966763496399, + "num_tokens": 847832001.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6885205507278442, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8783180713653564, + "num_tokens": 847867304.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6990668773651123, + "learning_rate": 1e-06, + "loss": 0.3672, + "mean_token_accuracy": 0.8840849995613098, + "num_tokens": 847899654.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5910406112670898, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8791429400444031, + "num_tokens": 847936038.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4666662216186523, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8777599334716797, + "num_tokens": 847979016.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.558488368988037, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8809912204742432, + "num_tokens": 848016945.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6667665243148804, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.872735857963562, + "num_tokens": 848054591.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.693345069885254, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8701238036155701, + "num_tokens": 848094976.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.78932523727417, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8694454431533813, + "num_tokens": 848131916.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8253766298294067, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8916668891906738, + "num_tokens": 848160606.0, + "step": 22233 + }, + { + "epoch": 2.828393334181402, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.687666893005371, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8724744319915771, + "num_tokens": 848200874.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6121937036514282, + "learning_rate": 1e-06, + "loss": 0.3386, + "mean_token_accuracy": 0.8889310359954834, + "num_tokens": 848238964.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6978504657745361, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8750086426734924, + "num_tokens": 848276212.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7125965356826782, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8668946027755737, + "num_tokens": 848316314.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.727247953414917, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8837682008743286, + "num_tokens": 848350562.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6197359561920166, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8628090620040894, + "num_tokens": 848391457.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5324406623840332, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8787721395492554, + "num_tokens": 848432779.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6903460025787354, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8723852634429932, + "num_tokens": 848467821.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5650062561035156, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8874561786651611, + "num_tokens": 848507874.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.617826223373413, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.882743239402771, + "num_tokens": 848544287.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7248809337615967, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8836644887924194, + "num_tokens": 848577303.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6085829734802246, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8821113705635071, + "num_tokens": 848615483.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4704548120498657, + "learning_rate": 1e-06, + "loss": 0.342, + "mean_token_accuracy": 0.8895247578620911, + "num_tokens": 848655908.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6932767629623413, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8701879978179932, + "num_tokens": 848700065.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7585772275924683, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8762895464897156, + "num_tokens": 848735023.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7164974212646484, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8683040142059326, + "num_tokens": 848773235.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.529924988746643, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8748574256896973, + "num_tokens": 848814617.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6447492837905884, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8768650889396667, + "num_tokens": 848851539.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6549447774887085, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8830617666244507, + "num_tokens": 848885659.0, + "step": 22252 + }, + { + "epoch": 2.8308103294746214, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6169178485870361, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8578941226005554, + "num_tokens": 848924341.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6505939960479736, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8738965392112732, + "num_tokens": 848958824.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6217411756515503, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8751903176307678, + "num_tokens": 848998432.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7700852155685425, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8874556422233582, + "num_tokens": 849034738.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5921454429626465, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8756380081176758, + "num_tokens": 849072148.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5903091430664062, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8854075074195862, + "num_tokens": 849107946.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.703612208366394, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8710697889328003, + "num_tokens": 849143024.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4904826879501343, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8705455660820007, + "num_tokens": 849186545.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5158966779708862, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8842780590057373, + "num_tokens": 849228075.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6878687143325806, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8891218900680542, + "num_tokens": 849264088.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.0644872188568115, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8794859051704407, + "num_tokens": 849296719.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6089857816696167, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8699989318847656, + "num_tokens": 849336358.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6058471202850342, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8869996070861816, + "num_tokens": 849371471.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6904183626174927, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8694700002670288, + "num_tokens": 849408122.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5710275173187256, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8698895573616028, + "num_tokens": 849446503.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6223112344741821, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.886472225189209, + "num_tokens": 849484609.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5158761739730835, + "learning_rate": 1e-06, + "loss": 0.3294, + "mean_token_accuracy": 0.89565110206604, + "num_tokens": 849522039.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7598515748977661, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8656865358352661, + "num_tokens": 849557117.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7692532539367676, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.848040759563446, + "num_tokens": 849596011.0, + "step": 22271 + }, + { + "epoch": 2.833227324767841, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6654975414276123, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8595664501190186, + "num_tokens": 849636444.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6085108518600464, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8771877884864807, + "num_tokens": 849677574.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5887948274612427, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8662840127944946, + "num_tokens": 849717754.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.512941598892212, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8601797223091125, + "num_tokens": 849761787.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.869704008102417, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8746630549430847, + "num_tokens": 849791538.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5724260807037354, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8756871223449707, + "num_tokens": 849833748.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7397421598434448, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8738287687301636, + "num_tokens": 849868294.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7083662748336792, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8789377212524414, + "num_tokens": 849903313.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6390044689178467, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.867341160774231, + "num_tokens": 849942775.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6112455129623413, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8741259574890137, + "num_tokens": 849981801.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7329672574996948, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8837706446647644, + "num_tokens": 850013352.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6292403936386108, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.886945366859436, + "num_tokens": 850048036.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.510766625404358, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8561963438987732, + "num_tokens": 850094853.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4728602170944214, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8780835270881653, + "num_tokens": 850140010.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4734498262405396, + "learning_rate": 1e-06, + "loss": 0.3348, + "mean_token_accuracy": 0.8903627395629883, + "num_tokens": 850182423.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.719817042350769, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8741692304611206, + "num_tokens": 850216375.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5387766361236572, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8722388744354248, + "num_tokens": 850259133.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6201010942459106, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8794591426849365, + "num_tokens": 850300873.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6403160095214844, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8765868544578552, + "num_tokens": 850339365.0, + "step": 22290 + }, + { + "epoch": 2.835644320061061, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5681664943695068, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8792738318443298, + "num_tokens": 850378242.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6192044019699097, + "learning_rate": 1e-06, + "loss": 0.3384, + "mean_token_accuracy": 0.8884015679359436, + "num_tokens": 850411564.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.596373200416565, + "learning_rate": 1e-06, + "loss": 0.3487, + "mean_token_accuracy": 0.8834571242332458, + "num_tokens": 850447111.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7678730487823486, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.878783643245697, + "num_tokens": 850479191.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6623995304107666, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8764603137969971, + "num_tokens": 850517701.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.687932014465332, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8725727796554565, + "num_tokens": 850556106.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7038161754608154, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8820092082023621, + "num_tokens": 850591916.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6911460161209106, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8809177875518799, + "num_tokens": 850626520.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 3.8481862545013428, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8722927570343018, + "num_tokens": 850662119.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6725449562072754, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8772375583648682, + "num_tokens": 850697793.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8515056371688843, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8673933744430542, + "num_tokens": 850733390.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6218135356903076, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8776854276657104, + "num_tokens": 850774706.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7409547567367554, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8510349988937378, + "num_tokens": 850814528.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7934911251068115, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8752497434616089, + "num_tokens": 850845551.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6688003540039062, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8743215799331665, + "num_tokens": 850883943.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5518815517425537, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8644053339958191, + "num_tokens": 850929194.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.753982424736023, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8673957586288452, + "num_tokens": 850962703.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6168946027755737, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8726904988288879, + "num_tokens": 851000809.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7351248264312744, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8758854866027832, + "num_tokens": 851032777.0, + "step": 22309 + }, + { + "epoch": 2.8380613153542806, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6337023973464966, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8739564418792725, + "num_tokens": 851069112.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5081372261047363, + "learning_rate": 1e-06, + "loss": 0.3323, + "mean_token_accuracy": 0.8905482292175293, + "num_tokens": 851109040.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6830047369003296, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8754352331161499, + "num_tokens": 851146609.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6420615911483765, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8822653889656067, + "num_tokens": 851182722.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6874274015426636, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.88160240650177, + "num_tokens": 851218027.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5479692220687866, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8638384938240051, + "num_tokens": 851264563.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6160753965377808, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8749524354934692, + "num_tokens": 851305086.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.770155429840088, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8623046875, + "num_tokens": 851339924.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.556215524673462, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8671134114265442, + "num_tokens": 851381401.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5655039548873901, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8797137141227722, + "num_tokens": 851420273.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7657984495162964, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.867087721824646, + "num_tokens": 851451570.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7207211256027222, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.872107744216919, + "num_tokens": 851489475.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4532229900360107, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.8866175413131714, + "num_tokens": 851533025.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8904845714569092, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8625730276107788, + "num_tokens": 851568292.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7253836393356323, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8739675283432007, + "num_tokens": 851606361.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.491001844406128, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8856900930404663, + "num_tokens": 851647881.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6085662841796875, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8808804154396057, + "num_tokens": 851683298.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.540734887123108, + "learning_rate": 1e-06, + "loss": 0.3465, + "mean_token_accuracy": 0.8850682973861694, + "num_tokens": 851721640.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8230923414230347, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.878472089767456, + "num_tokens": 851755380.0, + "step": 22328 + }, + { + "epoch": 2.8404783106475002, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8171814680099487, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8649929761886597, + "num_tokens": 851793148.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5232380628585815, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8759312629699707, + "num_tokens": 851833446.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6121066808700562, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8737698197364807, + "num_tokens": 851874468.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6328715085983276, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8762071132659912, + "num_tokens": 851911235.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.785333514213562, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8683987855911255, + "num_tokens": 851946007.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5764330625534058, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.880084753036499, + "num_tokens": 851984363.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6435997486114502, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8740290403366089, + "num_tokens": 852023433.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6935185194015503, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8830819129943848, + "num_tokens": 852057764.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7485284805297852, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8653417825698853, + "num_tokens": 852097661.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.685730218887329, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8574876189231873, + "num_tokens": 852140771.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7410937547683716, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8615566492080688, + "num_tokens": 852178487.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6231223344802856, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8717116117477417, + "num_tokens": 852214360.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.686799168586731, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8895928859710693, + "num_tokens": 852251456.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.615607738494873, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8751380443572998, + "num_tokens": 852292298.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7770520448684692, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8724360466003418, + "num_tokens": 852331864.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5222835540771484, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8854584693908691, + "num_tokens": 852370448.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.608504295349121, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8691847324371338, + "num_tokens": 852409964.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6807113885879517, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8603588342666626, + "num_tokens": 852447287.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7163238525390625, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8737015128135681, + "num_tokens": 852482610.0, + "step": 22347 + }, + { + "epoch": 2.8428953059407203, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6543959379196167, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8706343770027161, + "num_tokens": 852519805.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4466181993484497, + "learning_rate": 1e-06, + "loss": 0.3377, + "mean_token_accuracy": 0.8914011716842651, + "num_tokens": 852562069.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5586763620376587, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8716021776199341, + "num_tokens": 852604808.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6852110624313354, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8576738834381104, + "num_tokens": 852641601.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6287888288497925, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8764231204986572, + "num_tokens": 852682000.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7118453979492188, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8699403405189514, + "num_tokens": 852722582.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6772973537445068, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8701518774032593, + "num_tokens": 852763397.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.783277988433838, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8659394383430481, + "num_tokens": 852802345.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6340042352676392, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8650259971618652, + "num_tokens": 852839927.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.637194037437439, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8649861812591553, + "num_tokens": 852881639.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.571626901626587, + "learning_rate": 1e-06, + "loss": 0.3253, + "mean_token_accuracy": 0.8943026661872864, + "num_tokens": 852916725.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7113522291183472, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8699688911437988, + "num_tokens": 852954326.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.588281512260437, + "learning_rate": 1e-06, + "loss": 0.3343, + "mean_token_accuracy": 0.8929264545440674, + "num_tokens": 852991237.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9504609107971191, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.863027811050415, + "num_tokens": 853020517.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.638991355895996, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8762582540512085, + "num_tokens": 853064329.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.739903211593628, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8615599870681763, + "num_tokens": 853099171.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5451412200927734, + "learning_rate": 1e-06, + "loss": 0.3342, + "mean_token_accuracy": 0.8901723027229309, + "num_tokens": 853136437.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.0225436687469482, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8767091631889343, + "num_tokens": 853170078.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.567091703414917, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8755661249160767, + "num_tokens": 853213213.0, + "step": 22366 + }, + { + "epoch": 2.84531230123394, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.547121524810791, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8743281364440918, + "num_tokens": 853258201.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 2.9921531677246094e-05, + "grad_norm": 16.57193946838379, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.879335343837738, + "num_tokens": 853298436.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6874444484710693, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8775742053985596, + "num_tokens": 853338691.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7115246057510376, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8699074983596802, + "num_tokens": 853378998.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5272817611694336, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.865675151348114, + "num_tokens": 853422736.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.581642508506775, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8718330264091492, + "num_tokens": 853462468.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8097422122955322, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.872307538986206, + "num_tokens": 853493067.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5791370868682861, + "learning_rate": 1e-06, + "loss": 0.3258, + "mean_token_accuracy": 0.8938130140304565, + "num_tokens": 853531526.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.548430323600769, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8698155283927917, + "num_tokens": 853572567.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6967343091964722, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8824180364608765, + "num_tokens": 853607311.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7529748678207397, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8633146286010742, + "num_tokens": 853644333.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7558481693267822, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8759549260139465, + "num_tokens": 853682435.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.739284873008728, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8520545363426208, + "num_tokens": 853723860.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5129029750823975, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8723612427711487, + "num_tokens": 853766724.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7270313501358032, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8578415513038635, + "num_tokens": 853804466.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7006757259368896, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8742895126342773, + "num_tokens": 853842152.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7627660036087036, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8753447532653809, + "num_tokens": 853874597.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.627976894378662, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8627638816833496, + "num_tokens": 853917643.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.612454891204834, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8753847479820251, + "num_tokens": 853957253.0, + "step": 22385 + }, + { + "epoch": 2.8477292965271594, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5806368589401245, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8736061453819275, + "num_tokens": 853998025.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7425638437271118, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8653754591941833, + "num_tokens": 854038063.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8031885623931885, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8615835309028625, + "num_tokens": 854074293.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7007503509521484, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8753480911254883, + "num_tokens": 854109079.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5875585079193115, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8768481016159058, + "num_tokens": 854150142.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.675443410873413, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8801944255828857, + "num_tokens": 854186058.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8228042125701904, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8835434913635254, + "num_tokens": 854217448.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5967276096343994, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8579351902008057, + "num_tokens": 854261847.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6705611944198608, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8731692433357239, + "num_tokens": 854300368.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.593931794166565, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8686284422874451, + "num_tokens": 854341059.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6887454986572266, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8564426898956299, + "num_tokens": 854379555.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6002241373062134, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8769844770431519, + "num_tokens": 854415832.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6058984994888306, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8827558755874634, + "num_tokens": 854453448.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6579515933990479, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8747161626815796, + "num_tokens": 854488887.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6264302730560303, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8681814670562744, + "num_tokens": 854528619.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5494190454483032, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8665399551391602, + "num_tokens": 854572694.0, + "step": 22401 + }, + { + "epoch": 2.8497646609846075, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8189826011657715, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8752493262290955, + "num_tokens": 854604357.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.600101351737976, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8701633810997009, + "num_tokens": 854647534.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5589520931243896, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8789365291595459, + "num_tokens": 854690591.0, + "step": 22404 + }, + { + "epoch": 2.850146291820379, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.556870937347412, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.878899097442627, + "num_tokens": 854732305.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7287249565124512, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8729320168495178, + "num_tokens": 854766679.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8272384405136108, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8564252853393555, + "num_tokens": 854804231.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5990797281265259, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.866374135017395, + "num_tokens": 854844824.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6183972358703613, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8663672804832458, + "num_tokens": 854886748.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6072555780410767, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8608973026275635, + "num_tokens": 854930203.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.481317400932312, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8840718269348145, + "num_tokens": 854971439.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6237008571624756, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8787868618965149, + "num_tokens": 855006818.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7654447555541992, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8751907348632812, + "num_tokens": 855039214.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7139242887496948, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8746563196182251, + "num_tokens": 855075774.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.550437569618225, + "learning_rate": 1e-06, + "loss": 0.347, + "mean_token_accuracy": 0.8893032670021057, + "num_tokens": 855116085.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.642311453819275, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8773049116134644, + "num_tokens": 855151858.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4920111894607544, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8809805512428284, + "num_tokens": 855196138.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6340105533599854, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8873224258422852, + "num_tokens": 855230860.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.74517023563385, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8756297826766968, + "num_tokens": 855263972.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5726488828659058, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8686349391937256, + "num_tokens": 855305035.0, + "step": 22420 + }, + { + "epoch": 2.8521816562778275, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5820562839508057, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8665853142738342, + "num_tokens": 855343210.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5902974605560303, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8687134385108948, + "num_tokens": 855382647.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7043771743774414, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8756776452064514, + "num_tokens": 855420839.0, + "step": 22423 + }, + { + "epoch": 2.8525632871135986, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6594167947769165, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8729016780853271, + "num_tokens": 855458389.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.738673448562622, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8829019665718079, + "num_tokens": 855492929.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6197296380996704, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.879016637802124, + "num_tokens": 855530588.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.618330955505371, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.859110951423645, + "num_tokens": 855572739.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5850058794021606, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8660458326339722, + "num_tokens": 855615773.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6282036304473877, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8865234851837158, + "num_tokens": 855653001.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.600334882736206, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8735280632972717, + "num_tokens": 855693637.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.43971848487854, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8840200901031494, + "num_tokens": 855736123.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6375652551651, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8790673613548279, + "num_tokens": 855770844.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.599387288093567, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8796011209487915, + "num_tokens": 855808235.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5438185930252075, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8866491317749023, + "num_tokens": 855848755.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6963914632797241, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8620331883430481, + "num_tokens": 855886828.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6438387632369995, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8633257746696472, + "num_tokens": 855926992.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6825284957885742, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8825002908706665, + "num_tokens": 855962477.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7099653482437134, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.868573009967804, + "num_tokens": 856001833.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7566478252410889, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8722202181816101, + "num_tokens": 856038044.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7635314464569092, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8784353733062744, + "num_tokens": 856072993.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7646821737289429, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8547362089157104, + "num_tokens": 856112417.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6414717435836792, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8620965480804443, + "num_tokens": 856154785.0, + "step": 22442 + }, + { + "epoch": 2.8549802824068182, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6137624979019165, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8761940598487854, + "num_tokens": 856192705.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5665782690048218, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8851088285446167, + "num_tokens": 856230168.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7195810079574585, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8699025511741638, + "num_tokens": 856273064.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5131051540374756, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8827581405639648, + "num_tokens": 856313020.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5987577438354492, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8738904595375061, + "num_tokens": 856353765.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6676212549209595, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8824598789215088, + "num_tokens": 856390472.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.56235933303833, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8791195154190063, + "num_tokens": 856429718.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6482046842575073, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8616853952407837, + "num_tokens": 856472344.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5137690305709839, + "learning_rate": 1e-06, + "loss": 0.3146, + "mean_token_accuracy": 0.8960384130477905, + "num_tokens": 856509017.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8779305219650269, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8771079778671265, + "num_tokens": 856541134.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7347878217697144, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8851464986801147, + "num_tokens": 856577055.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6463724374771118, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.87882000207901, + "num_tokens": 856614111.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6035149097442627, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8742288947105408, + "num_tokens": 856652874.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6750677824020386, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8771266937255859, + "num_tokens": 856686246.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6269782781600952, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.850975751876831, + "num_tokens": 856728481.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.626092791557312, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8729720115661621, + "num_tokens": 856765770.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6478946208953857, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.882759690284729, + "num_tokens": 856801477.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5232155323028564, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.874329686164856, + "num_tokens": 856846851.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4871236085891724, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8821040987968445, + "num_tokens": 856887823.0, + "step": 22461 + }, + { + "epoch": 2.8573972777000383, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6590690612792969, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.866547703742981, + "num_tokens": 856927373.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6129882335662842, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8807321786880493, + "num_tokens": 856966910.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.716808795928955, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.874135434627533, + "num_tokens": 857003599.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6999952793121338, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8624851703643799, + "num_tokens": 857044617.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 2.2892377376556396, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8778253197669983, + "num_tokens": 857084697.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6936475038528442, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.879115104675293, + "num_tokens": 857123019.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.602502465248108, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.879845380783081, + "num_tokens": 857163260.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5945202112197876, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8832976818084717, + "num_tokens": 857202126.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5368304252624512, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8755785226821899, + "num_tokens": 857242440.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5485693216323853, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8709279298782349, + "num_tokens": 857284595.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6561999320983887, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8803413510322571, + "num_tokens": 857320900.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5931739807128906, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8634968400001526, + "num_tokens": 857361781.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5556280612945557, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8605548739433289, + "num_tokens": 857405218.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6689949035644531, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8728277683258057, + "num_tokens": 857443809.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6291162967681885, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8620715737342834, + "num_tokens": 857483100.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.567765235900879, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.881738543510437, + "num_tokens": 857524056.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.558846116065979, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8901416063308716, + "num_tokens": 857560337.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5820108652114868, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8826243877410889, + "num_tokens": 857596118.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5773425102233887, + "learning_rate": 1e-06, + "loss": 0.3615, + "mean_token_accuracy": 0.8839153051376343, + "num_tokens": 857635031.0, + "step": 22480 + }, + { + "epoch": 2.859814272993258, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7073379755020142, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8689257502555847, + "num_tokens": 857673507.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.710116982460022, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8648790717124939, + "num_tokens": 857711622.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6322170495986938, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8765937089920044, + "num_tokens": 857749204.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6310698986053467, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8678600788116455, + "num_tokens": 857790929.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6568222045898438, + "learning_rate": 1e-06, + "loss": 0.3482, + "mean_token_accuracy": 0.8870878219604492, + "num_tokens": 857822531.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6037169694900513, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8851655125617981, + "num_tokens": 857862034.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5371061563491821, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8805176019668579, + "num_tokens": 857902439.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8435819149017334, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8724724054336548, + "num_tokens": 857935760.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6712613105773926, + "learning_rate": 1e-06, + "loss": 0.3372, + "mean_token_accuracy": 0.8919291496276855, + "num_tokens": 857972891.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7764447927474976, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.854877233505249, + "num_tokens": 858008924.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6399623155593872, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8763654232025146, + "num_tokens": 858050225.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.665847897529602, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.853886604309082, + "num_tokens": 858093609.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6243095397949219, + "learning_rate": 1e-06, + "loss": 0.3572, + "mean_token_accuracy": 0.8859671950340271, + "num_tokens": 858134293.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7151199579238892, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8748711347579956, + "num_tokens": 858170650.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6691346168518066, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8736987113952637, + "num_tokens": 858213308.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6766568422317505, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8689771890640259, + "num_tokens": 858251741.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7035664319992065, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.867448091506958, + "num_tokens": 858290040.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7699087858200073, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8827645778656006, + "num_tokens": 858320659.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.657301902770996, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8788912296295166, + "num_tokens": 858358186.0, + "step": 22499 + }, + { + "epoch": 2.8622312682864774, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5562602281570435, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8667294383049011, + "num_tokens": 858399456.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.67598295211792, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8627253770828247, + "num_tokens": 858441991.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.668716311454773, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8829387426376343, + "num_tokens": 858479822.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.582504153251648, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8635637760162354, + "num_tokens": 858521168.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6038318872451782, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8715062737464905, + "num_tokens": 858561035.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5519022941589355, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8663532137870789, + "num_tokens": 858607252.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5773011445999146, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8833631277084351, + "num_tokens": 858645401.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6073251962661743, + "learning_rate": 1e-06, + "loss": 0.3601, + "mean_token_accuracy": 0.8808911442756653, + "num_tokens": 858684935.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5583727359771729, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8783512115478516, + "num_tokens": 858726838.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.648419737815857, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8692218065261841, + "num_tokens": 858767260.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5492178201675415, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8827155828475952, + "num_tokens": 858806572.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5972951650619507, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8654625415802002, + "num_tokens": 858847630.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5567132234573364, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8970063328742981, + "num_tokens": 858883926.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.692713975906372, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8599229454994202, + "num_tokens": 858926924.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7319674491882324, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8731384873390198, + "num_tokens": 858962609.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9898697137832642, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8690763711929321, + "num_tokens": 858997520.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6457619667053223, + "learning_rate": 1e-06, + "loss": 0.3619, + "mean_token_accuracy": 0.8838880062103271, + "num_tokens": 859033495.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.597006916999817, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8806477785110474, + "num_tokens": 859068969.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6591174602508545, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8657208681106567, + "num_tokens": 859106443.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5733171701431274, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8654507398605347, + "num_tokens": 859149813.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6532236337661743, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8746505379676819, + "num_tokens": 859192135.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6858195066452026, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8774934411048889, + "num_tokens": 859230349.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5726040601730347, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8715914487838745, + "num_tokens": 859267694.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6177774667739868, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8850921392440796, + "num_tokens": 859306479.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5537787675857544, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8737291693687439, + "num_tokens": 859349569.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6714450120925903, + "learning_rate": 1e-06, + "loss": 0.345, + "mean_token_accuracy": 0.8873631358146667, + "num_tokens": 859387300.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7074342966079712, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8695886731147766, + "num_tokens": 859424882.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5437439680099487, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8824759721755981, + "num_tokens": 859462943.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5534989833831787, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8735528588294983, + "num_tokens": 859503944.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4876371622085571, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8754875659942627, + "num_tokens": 859551270.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7140032052993774, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8745900988578796, + "num_tokens": 859584604.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5519587993621826, + "learning_rate": 1e-06, + "loss": 0.3394, + "mean_token_accuracy": 0.8887426853179932, + "num_tokens": 859621161.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5768744945526123, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8772905468940735, + "num_tokens": 859661509.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 3.7818422317504883, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8763355016708374, + "num_tokens": 859695490.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7793387174606323, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8632423281669617, + "num_tokens": 859729873.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6999212503433228, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8784157037734985, + "num_tokens": 859765769.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6730730533599854, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8736420273780823, + "num_tokens": 859799886.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5808897018432617, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8842589855194092, + "num_tokens": 859839060.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6423671245574951, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8608330488204956, + "num_tokens": 859880670.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6647098064422607, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8689515590667725, + "num_tokens": 859919584.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6085232496261597, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.87432461977005, + "num_tokens": 859962073.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5753495693206787, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8760918974876404, + "num_tokens": 860003495.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6602712869644165, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.851070761680603, + "num_tokens": 860042035.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5239452123641968, + "learning_rate": 1e-06, + "loss": 0.3331, + "mean_token_accuracy": 0.8915882110595703, + "num_tokens": 860081591.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7618579864501953, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8818120956420898, + "num_tokens": 860112795.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6700061559677124, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8690296411514282, + "num_tokens": 860149031.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6850903034210205, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8713263273239136, + "num_tokens": 860186816.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6206772327423096, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.870881199836731, + "num_tokens": 860227288.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7401059865951538, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8761069178581238, + "num_tokens": 860263179.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7765291929244995, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8792048692703247, + "num_tokens": 860296655.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7360225915908813, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8760056495666504, + "num_tokens": 860328387.0, + "step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.686211347579956, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8693109750747681, + "num_tokens": 860364558.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6328177452087402, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8533613681793213, + "num_tokens": 860409370.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6600844860076904, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8766337633132935, + "num_tokens": 860443988.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4925485849380493, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8866419792175293, + "num_tokens": 860486646.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.3754475116729736, + "learning_rate": 1e-06, + "loss": 0.3126, + "mean_token_accuracy": 0.9000421166419983, + "num_tokens": 860533239.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5117305517196655, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8779144287109375, + "num_tokens": 860572325.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.606084942817688, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8713933825492859, + "num_tokens": 860613580.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7175772190093994, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8765435814857483, + "num_tokens": 860647916.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7459896802902222, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.8833668231964111, + "num_tokens": 860676836.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5528643131256104, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.876761794090271, + "num_tokens": 860717794.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7517083883285522, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8672424554824829, + "num_tokens": 860754498.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7001768350601196, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8824254870414734, + "num_tokens": 860788288.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6938575506210327, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8831315040588379, + "num_tokens": 860824340.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7228046655654907, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.881689727306366, + "num_tokens": 860855562.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5596117973327637, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8857044577598572, + "num_tokens": 860897693.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6827080249786377, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.881973385810852, + "num_tokens": 860933805.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5659886598587036, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8733586072921753, + "num_tokens": 860975752.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5897784233093262, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8768031001091003, + "num_tokens": 861013038.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7275497913360596, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.853211522102356, + "num_tokens": 861051590.0, + "step": 22569 + }, + { + "epoch": 2.871135987787813, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6852712631225586, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8817681074142456, + "num_tokens": 861090056.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7548314332962036, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8703460693359375, + "num_tokens": 861127686.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5674885511398315, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.882744312286377, + "num_tokens": 861169309.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6313306093215942, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8706047534942627, + "num_tokens": 861208782.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7133467197418213, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.854010820388794, + "num_tokens": 861245777.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.525875449180603, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8752046227455139, + "num_tokens": 861290047.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6559988260269165, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8726276755332947, + "num_tokens": 861326101.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.631341576576233, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8765815496444702, + "num_tokens": 861361771.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5537811517715454, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8720757961273193, + "num_tokens": 861404288.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.608747959136963, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8853776454925537, + "num_tokens": 861440272.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5356906652450562, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8751227259635925, + "num_tokens": 861484130.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.614586353302002, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8826783895492554, + "num_tokens": 861520775.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.671791434288025, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8709253072738647, + "num_tokens": 861557064.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.54294753074646, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8794331550598145, + "num_tokens": 861601776.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5711613893508911, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8880636692047119, + "num_tokens": 861639022.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6122393608093262, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8728516101837158, + "num_tokens": 861679056.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.3959827423095703, + "learning_rate": 1e-06, + "loss": 0.3286, + "mean_token_accuracy": 0.8935890793800354, + "num_tokens": 861722113.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.701600432395935, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8653832674026489, + "num_tokens": 861760909.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7020272016525269, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8678928017616272, + "num_tokens": 861802262.0, + "step": 22588 + }, + { + "epoch": 2.8735529830810327, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5847127437591553, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8763644695281982, + "num_tokens": 861839958.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6684521436691284, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8746362924575806, + "num_tokens": 861880238.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 3.7576749324798584, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8807498812675476, + "num_tokens": 861914965.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4973514080047607, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8718268871307373, + "num_tokens": 861963546.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6011186838150024, + "learning_rate": 1e-06, + "loss": 0.3365, + "mean_token_accuracy": 0.891249418258667, + "num_tokens": 862001198.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7344279289245605, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8687122464179993, + "num_tokens": 862037508.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.526003122329712, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.866367757320404, + "num_tokens": 862082412.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6931267976760864, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.874071478843689, + "num_tokens": 862117141.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6044342517852783, + "learning_rate": 1e-06, + "loss": 0.3066, + "mean_token_accuracy": 0.8993313908576965, + "num_tokens": 862150059.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5457251071929932, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8753470182418823, + "num_tokens": 862191935.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7667793035507202, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8659117221832275, + "num_tokens": 862228197.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.615681767463684, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.877268373966217, + "num_tokens": 862267319.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5391919612884521, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.871422290802002, + "num_tokens": 862306634.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5944989919662476, + "learning_rate": 1e-06, + "loss": 0.3427, + "mean_token_accuracy": 0.8893530368804932, + "num_tokens": 862340824.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6276237964630127, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8825823068618774, + "num_tokens": 862379770.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5391225814819336, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8796414136886597, + "num_tokens": 862422624.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6097609996795654, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8701823949813843, + "num_tokens": 862462780.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4991732835769653, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8670297265052795, + "num_tokens": 862511310.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6356899738311768, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8852952718734741, + "num_tokens": 862550719.0, + "step": 22607 + }, + { + "epoch": 2.8759699783742527, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6999471187591553, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8798561096191406, + "num_tokens": 862584476.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.778825283050537, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8545148968696594, + "num_tokens": 862621476.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.964195728302002, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8561503291130066, + "num_tokens": 862654876.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7838152647018433, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.876103937625885, + "num_tokens": 862692712.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4656707048416138, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.878772497177124, + "num_tokens": 862735740.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5037466287612915, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.872246265411377, + "num_tokens": 862779883.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4358489513397217, + "learning_rate": 1e-06, + "loss": 0.3419, + "mean_token_accuracy": 0.8888213038444519, + "num_tokens": 862820370.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7878899574279785, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8649008274078369, + "num_tokens": 862858312.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.77936851978302, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8748074769973755, + "num_tokens": 862889674.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6142152547836304, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8696047067642212, + "num_tokens": 862931702.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7017734050750732, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8646306395530701, + "num_tokens": 862969969.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.515700340270996, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8854148387908936, + "num_tokens": 863010251.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5973010063171387, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8900331854820251, + "num_tokens": 863047721.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8178036212921143, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8730162382125854, + "num_tokens": 863082687.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6170378923416138, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8781756162643433, + "num_tokens": 863122460.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6737496852874756, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8864642977714539, + "num_tokens": 863156432.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7347198724746704, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8670710325241089, + "num_tokens": 863191438.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.578678846359253, + "learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8773596286773682, + "num_tokens": 863227815.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6082148551940918, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8749317526817322, + "num_tokens": 863266191.0, + "step": 22626 + }, + { + "epoch": 2.8783869736674723, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5877293348312378, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8772420883178711, + "num_tokens": 863308198.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6035702228546143, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8704613447189331, + "num_tokens": 863347574.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.607469081878662, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8725993037223816, + "num_tokens": 863385237.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5860356092453003, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8707255125045776, + "num_tokens": 863428860.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5683095455169678, + "learning_rate": 1e-06, + "loss": 0.3194, + "mean_token_accuracy": 0.8943266272544861, + "num_tokens": 863463943.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.72967529296875, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8589823246002197, + "num_tokens": 863498445.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4630191326141357, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8834432363510132, + "num_tokens": 863542495.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5852092504501343, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.8851807117462158, + "num_tokens": 863580864.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6480103731155396, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8544281125068665, + "num_tokens": 863620897.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6127309799194336, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8832619190216064, + "num_tokens": 863655770.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6141878366470337, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8728248476982117, + "num_tokens": 863697110.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5019820928573608, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8801616430282593, + "num_tokens": 863740131.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5867825746536255, + "learning_rate": 1e-06, + "loss": 0.3382, + "mean_token_accuracy": 0.8913215398788452, + "num_tokens": 863777617.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6185640096664429, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8744772672653198, + "num_tokens": 863817521.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6960641145706177, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8728313446044922, + "num_tokens": 863852492.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6454800367355347, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.885812520980835, + "num_tokens": 863886789.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6309221982955933, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8569789528846741, + "num_tokens": 863928754.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5709952116012573, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8743946552276611, + "num_tokens": 863971312.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6084527969360352, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8795689344406128, + "num_tokens": 864012306.0, + "step": 22645 + }, + { + "epoch": 2.880803968960692, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5859180688858032, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.876541256904602, + "num_tokens": 864053281.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5892692804336548, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8857381343841553, + "num_tokens": 864092830.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5899288654327393, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8730558156967163, + "num_tokens": 864130186.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4805837869644165, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.880424439907074, + "num_tokens": 864175037.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7141790390014648, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8746240139007568, + "num_tokens": 864208540.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.674216628074646, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8480375409126282, + "num_tokens": 864252361.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6327496767044067, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8694196343421936, + "num_tokens": 864292316.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5317254066467285, + "learning_rate": 1e-06, + "loss": 0.3375, + "mean_token_accuracy": 0.889437198638916, + "num_tokens": 864331442.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.493323802947998, + "learning_rate": 1e-06, + "loss": 0.3496, + "mean_token_accuracy": 0.8853573799133301, + "num_tokens": 864371374.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6364409923553467, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.868043839931488, + "num_tokens": 864406760.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7132395505905151, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8818420171737671, + "num_tokens": 864439398.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6701927185058594, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8657468557357788, + "num_tokens": 864475989.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5053735971450806, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8726364374160767, + "num_tokens": 864520341.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6573607921600342, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8628851175308228, + "num_tokens": 864559940.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6235158443450928, + "learning_rate": 1e-06, + "loss": 0.3298, + "mean_token_accuracy": 0.8936858773231506, + "num_tokens": 864597587.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6934617757797241, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8613461256027222, + "num_tokens": 864638717.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.683905005455017, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8685842752456665, + "num_tokens": 864676680.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.624709963798523, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8737537264823914, + "num_tokens": 864714501.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.617344856262207, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8633877038955688, + "num_tokens": 864754393.0, + "step": 22664 + }, + { + "epoch": 2.883220964253912, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5087292194366455, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8806405663490295, + "num_tokens": 864797379.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5310256481170654, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8831285238265991, + "num_tokens": 864839330.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6702653169631958, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8682072162628174, + "num_tokens": 864877668.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6304359436035156, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8844425678253174, + "num_tokens": 864916578.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.522674560546875, + "learning_rate": 1e-06, + "loss": 0.3228, + "mean_token_accuracy": 0.8948569297790527, + "num_tokens": 864954151.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4909807443618774, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8822047710418701, + "num_tokens": 864994931.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.730122447013855, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8891456127166748, + "num_tokens": 865030073.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6873785257339478, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8687951564788818, + "num_tokens": 865064915.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.561794400215149, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8934471011161804, + "num_tokens": 865100102.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6043362617492676, + "learning_rate": 1e-06, + "loss": 0.3566, + "mean_token_accuracy": 0.8805670738220215, + "num_tokens": 865138333.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7794864177703857, + "learning_rate": 1e-06, + "loss": 0.3493, + "mean_token_accuracy": 0.8858655095100403, + "num_tokens": 865173259.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.607999563217163, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.875996470451355, + "num_tokens": 865209344.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4209579229354858, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8863589763641357, + "num_tokens": 865255333.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5141658782958984, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8770315647125244, + "num_tokens": 865297521.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7596246004104614, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8660074472427368, + "num_tokens": 865333046.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.512643814086914, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8754290342330933, + "num_tokens": 865378468.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6955490112304688, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8797327280044556, + "num_tokens": 865411795.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6919828653335571, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8706881403923035, + "num_tokens": 865446734.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.671553611755371, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8583456873893738, + "num_tokens": 865489497.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5446617603302002, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.887938916683197, + "num_tokens": 865528336.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6967421770095825, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8828818798065186, + "num_tokens": 865564008.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7329151630401611, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8530370593070984, + "num_tokens": 865599728.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7223774194717407, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8791295886039734, + "num_tokens": 865634147.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6188859939575195, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8627815842628479, + "num_tokens": 865675979.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6290887594223022, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8836759328842163, + "num_tokens": 865713618.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8504914045333862, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8870671987533569, + "num_tokens": 865741490.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.631726861000061, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8821534514427185, + "num_tokens": 865775750.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5895977020263672, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8697597980499268, + "num_tokens": 865817156.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.531018614768982, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8856359720230103, + "num_tokens": 865858024.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6528689861297607, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8728528618812561, + "num_tokens": 865897344.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6984206438064575, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.87872713804245, + "num_tokens": 865934367.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5819463729858398, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8739877939224243, + "num_tokens": 865976091.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6879093647003174, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8638995885848999, + "num_tokens": 866018313.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5036635398864746, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8616827130317688, + "num_tokens": 866065077.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6118907928466797, + "learning_rate": 1e-06, + "loss": 0.3205, + "mean_token_accuracy": 0.8941212892532349, + "num_tokens": 866098350.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6055541038513184, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8720977306365967, + "num_tokens": 866137416.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.610949993133545, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8791748285293579, + "num_tokens": 866177311.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7342087030410767, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8512449860572815, + "num_tokens": 866216466.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.772242546081543, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8616703748703003, + "num_tokens": 866253857.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.58440101146698, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8835103511810303, + "num_tokens": 866289585.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5362963676452637, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8725723028182983, + "num_tokens": 866327774.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5158021450042725, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8718975782394409, + "num_tokens": 866371879.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6626670360565186, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8525005578994751, + "num_tokens": 866414732.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.630313754081726, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8687375783920288, + "num_tokens": 866454173.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6047693490982056, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8787627816200256, + "num_tokens": 866492075.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6549991369247437, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.870789647102356, + "num_tokens": 866529847.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6056469678878784, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8759382367134094, + "num_tokens": 866567790.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7343956232070923, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.860975980758667, + "num_tokens": 866604802.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5644803047180176, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8619966506958008, + "num_tokens": 866652124.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.710328459739685, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8764663934707642, + "num_tokens": 866687036.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7314271926879883, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8858821392059326, + "num_tokens": 866718533.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.743196725845337, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8726816177368164, + "num_tokens": 866750166.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7243794202804565, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8699426651000977, + "num_tokens": 866782589.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6400233507156372, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8596140742301941, + "num_tokens": 866823113.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9637500047683716, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.873946487903595, + "num_tokens": 866865704.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.645958423614502, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8735626935958862, + "num_tokens": 866901259.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4799752235412598, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8878281116485596, + "num_tokens": 866941032.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.656407356262207, + "learning_rate": 1e-06, + "loss": 0.325, + "mean_token_accuracy": 0.8951753377914429, + "num_tokens": 866977266.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7049784660339355, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8828719854354858, + "num_tokens": 867011991.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5786296129226685, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8744208812713623, + "num_tokens": 867052108.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.731227159500122, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8733959794044495, + "num_tokens": 867088523.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6858022212982178, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8717183470726013, + "num_tokens": 867126446.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8852003812789917, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8525108098983765, + "num_tokens": 867159776.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5516619682312012, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.878571629524231, + "num_tokens": 867199887.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.550026297569275, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8802627921104431, + "num_tokens": 867242101.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4908417463302612, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8843465447425842, + "num_tokens": 867282099.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7059131860733032, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8830593824386597, + "num_tokens": 867316551.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6036510467529297, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8782037496566772, + "num_tokens": 867354225.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.645115613937378, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8765853643417358, + "num_tokens": 867394762.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6904884576797485, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8704631328582764, + "num_tokens": 867429157.0, + "step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5852590799331665, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8881677389144897, + "num_tokens": 867464697.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5905126333236694, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8654170036315918, + "num_tokens": 867508888.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5799446105957031, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8881105184555054, + "num_tokens": 867544679.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7015609741210938, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8702830672264099, + "num_tokens": 867578410.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5967328548431396, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8785516023635864, + "num_tokens": 867616182.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.464063286781311, + "learning_rate": 1e-06, + "loss": 0.3255, + "mean_token_accuracy": 0.8956084251403809, + "num_tokens": 867653963.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5106589794158936, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8886040449142456, + "num_tokens": 867691746.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6440186500549316, + "learning_rate": 1e-06, + "loss": 0.3565, + "mean_token_accuracy": 0.885839581489563, + "num_tokens": 867728734.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6150633096694946, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8752905130386353, + "num_tokens": 867765424.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7646549940109253, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.874326229095459, + "num_tokens": 867796779.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6353943347930908, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8764002323150635, + "num_tokens": 867837859.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6701804399490356, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8487337827682495, + "num_tokens": 867877653.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4490063190460205, + "learning_rate": 1e-06, + "loss": 0.3346, + "mean_token_accuracy": 0.8903182744979858, + "num_tokens": 867920580.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5932286977767944, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8682659864425659, + "num_tokens": 867961553.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5746335983276367, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8810960650444031, + "num_tokens": 868001573.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7315324544906616, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8647158741950989, + "num_tokens": 868038283.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5262857675552368, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.870092511177063, + "num_tokens": 868080094.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7500427961349487, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8731153011322021, + "num_tokens": 868115577.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.62031090259552, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8674216270446777, + "num_tokens": 868154058.0, + "step": 22753 + }, + { + "epoch": 2.894542679048467, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6901938915252686, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8725382089614868, + "num_tokens": 868192263.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6568466424942017, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8884224891662598, + "num_tokens": 868227094.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.608336329460144, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.880439281463623, + "num_tokens": 868264567.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6877437829971313, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8821334838867188, + "num_tokens": 868297049.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.631058931350708, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8688516616821289, + "num_tokens": 868336114.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6775413751602173, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8675007820129395, + "num_tokens": 868369468.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6861492395401, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8702937364578247, + "num_tokens": 868406284.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5619699954986572, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8754116892814636, + "num_tokens": 868444709.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.681119441986084, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8684327006340027, + "num_tokens": 868483662.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4775822162628174, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.865715742111206, + "num_tokens": 868528937.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6592772006988525, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8883271217346191, + "num_tokens": 868563774.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6234735250473022, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8731755614280701, + "num_tokens": 868602967.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.562270164489746, + "learning_rate": 1e-06, + "loss": 0.3598, + "mean_token_accuracy": 0.8807650804519653, + "num_tokens": 868643043.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.565285325050354, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.881524920463562, + "num_tokens": 868681397.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5899665355682373, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8746573328971863, + "num_tokens": 868721691.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6055569648742676, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8731868267059326, + "num_tokens": 868764570.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.603968858718872, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8840494155883789, + "num_tokens": 868801489.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5992892980575562, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8691587448120117, + "num_tokens": 868841557.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5883272886276245, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8689056634902954, + "num_tokens": 868879870.0, + "step": 22772 + }, + { + "epoch": 2.8969596743416868, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5975382328033447, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8791142702102661, + "num_tokens": 868914710.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6406915187835693, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.885527491569519, + "num_tokens": 868949817.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4267538785934448, + "learning_rate": 1e-06, + "loss": 0.3185, + "mean_token_accuracy": 0.8988900780677795, + "num_tokens": 868988082.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6590323448181152, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8718876242637634, + "num_tokens": 869022901.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6898282766342163, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.867436945438385, + "num_tokens": 869057061.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7089591026306152, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8838523030281067, + "num_tokens": 869094424.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.471271276473999, + "learning_rate": 1e-06, + "loss": 0.3439, + "mean_token_accuracy": 0.8867924213409424, + "num_tokens": 869134148.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5713255405426025, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8678401112556458, + "num_tokens": 869174640.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5988177061080933, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8724279403686523, + "num_tokens": 869215725.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6118433475494385, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8683652877807617, + "num_tokens": 869252965.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6504054069519043, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8765822052955627, + "num_tokens": 869286671.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5995299816131592, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8720347285270691, + "num_tokens": 869325661.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6956583261489868, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8626190423965454, + "num_tokens": 869362108.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5807366371154785, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8782106637954712, + "num_tokens": 869400656.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7219597101211548, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8645045161247253, + "num_tokens": 869436773.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7224574089050293, + "learning_rate": 1e-06, + "loss": 0.3179, + "mean_token_accuracy": 0.8964210748672485, + "num_tokens": 869469208.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5743952989578247, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8850434422492981, + "num_tokens": 869507343.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.442152500152588, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8874027132987976, + "num_tokens": 869549310.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.602637529373169, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8729541301727295, + "num_tokens": 869589584.0, + "step": 22791 + }, + { + "epoch": 2.8993766696349064, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.612683653831482, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8686870336532593, + "num_tokens": 869629962.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5874032974243164, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.887903094291687, + "num_tokens": 869671919.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.695322871208191, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8705742359161377, + "num_tokens": 869714998.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6135777235031128, + "learning_rate": 1e-06, + "loss": 0.3573, + "mean_token_accuracy": 0.8824372291564941, + "num_tokens": 869752246.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.957590103149414, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8440507650375366, + "num_tokens": 869788982.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6549639701843262, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8852812051773071, + "num_tokens": 869825918.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8440438508987427, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8734850883483887, + "num_tokens": 869864557.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5582329034805298, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8624914884567261, + "num_tokens": 869904908.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6773202419281006, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8723568916320801, + "num_tokens": 869940652.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6871293783187866, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8829690217971802, + "num_tokens": 869980910.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6306706666946411, + "learning_rate": 1e-06, + "loss": 0.3463, + "mean_token_accuracy": 0.8859999179840088, + "num_tokens": 870017703.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6822782754898071, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8581755757331848, + "num_tokens": 870056813.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.706921935081482, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8648938536643982, + "num_tokens": 870093601.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6474895477294922, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8717705607414246, + "num_tokens": 870134913.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5344947576522827, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8683089017868042, + "num_tokens": 870180876.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5283467769622803, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8749882578849792, + "num_tokens": 870222445.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5562368631362915, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8763146996498108, + "num_tokens": 870262647.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7018113136291504, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.866154134273529, + "num_tokens": 870298679.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6919969320297241, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8730994462966919, + "num_tokens": 870336276.0, + "step": 22810 + }, + { + "epoch": 2.9017936649281264, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6111122369766235, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8843832612037659, + "num_tokens": 870373653.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7456941604614258, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8628392219543457, + "num_tokens": 870412015.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.787398338317871, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8550411462783813, + "num_tokens": 870445613.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5775171518325806, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8747401833534241, + "num_tokens": 870486782.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.841322660446167, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8706748485565186, + "num_tokens": 870522411.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5843273401260376, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.868930459022522, + "num_tokens": 870563164.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7438879013061523, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8767414689064026, + "num_tokens": 870599585.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7391095161437988, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8783084154129028, + "num_tokens": 870634162.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.0368900299072266, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8619736433029175, + "num_tokens": 870662668.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6771475076675415, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8706687688827515, + "num_tokens": 870696940.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8865896463394165, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8553235530853271, + "num_tokens": 870732097.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6713541746139526, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8588854074478149, + "num_tokens": 870775566.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6240991353988647, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8819152116775513, + "num_tokens": 870812225.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.699623465538025, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8736129999160767, + "num_tokens": 870845704.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6756842136383057, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8679826855659485, + "num_tokens": 870885449.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.662020206451416, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8575035333633423, + "num_tokens": 870926088.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7236592769622803, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8678411245346069, + "num_tokens": 870966367.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8161870241165161, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8738336563110352, + "num_tokens": 870997498.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5992895364761353, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8776334524154663, + "num_tokens": 871035186.0, + "step": 22829 + }, + { + "epoch": 2.904210660221346, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6214194297790527, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8876330852508545, + "num_tokens": 871069859.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6295583248138428, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8734225034713745, + "num_tokens": 871113399.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5047271251678467, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8909167051315308, + "num_tokens": 871150754.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7576406002044678, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8665809035301208, + "num_tokens": 871185653.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5849705934524536, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8723273277282715, + "num_tokens": 871225336.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7588887214660645, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8711085319519043, + "num_tokens": 871261408.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.587438702583313, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8598902821540833, + "num_tokens": 871307124.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6204123497009277, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8763478994369507, + "num_tokens": 871343953.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6078364849090576, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.875354528427124, + "num_tokens": 871386793.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.776897668838501, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.872861385345459, + "num_tokens": 871421362.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.577847957611084, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8717509508132935, + "num_tokens": 871464180.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7177783250808716, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.870132565498352, + "num_tokens": 871499606.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6793063879013062, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8575200438499451, + "num_tokens": 871536674.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6824027299880981, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8840506076812744, + "num_tokens": 871577489.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7553675174713135, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8752439022064209, + "num_tokens": 871612661.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6885144710540771, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.877011775970459, + "num_tokens": 871646525.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6262447834014893, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.858985185623169, + "num_tokens": 871686402.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.629189133644104, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8861021995544434, + "num_tokens": 871721900.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5548055171966553, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8798967003822327, + "num_tokens": 871766058.0, + "step": 22848 + }, + { + "epoch": 2.9066276555145656, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.665378451347351, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8839057087898254, + "num_tokens": 871801191.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7583818435668945, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8765692710876465, + "num_tokens": 871835043.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.696195125579834, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8624082803726196, + "num_tokens": 871871584.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.82846200466156, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8867744207382202, + "num_tokens": 871904522.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6442829370498657, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8909980058670044, + "num_tokens": 871945872.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7255531549453735, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8747807741165161, + "num_tokens": 871985160.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6349799633026123, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8498321771621704, + "num_tokens": 872029104.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6006004810333252, + "learning_rate": 1e-06, + "loss": 0.3494, + "mean_token_accuracy": 0.8898122310638428, + "num_tokens": 872071126.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7074264287948608, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8680276870727539, + "num_tokens": 872110524.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6027569770812988, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8808712363243103, + "num_tokens": 872153065.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.647026538848877, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8768342733383179, + "num_tokens": 872192498.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.652662992477417, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8687418103218079, + "num_tokens": 872230760.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8422118425369263, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.865350604057312, + "num_tokens": 872268308.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.595845103263855, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.860575258731842, + "num_tokens": 872304652.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4950213432312012, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8802752494812012, + "num_tokens": 872348966.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5752971172332764, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8746047616004944, + "num_tokens": 872390034.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6177304983139038, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8768860697746277, + "num_tokens": 872425856.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.671292781829834, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8730498552322388, + "num_tokens": 872464185.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5127463340759277, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8929159641265869, + "num_tokens": 872501418.0, + "step": 22867 + }, + { + "epoch": 2.909044650807785, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6145853996276855, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8666248321533203, + "num_tokens": 872542015.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6571669578552246, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8558384776115417, + "num_tokens": 872580884.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.561248779296875, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8729205131530762, + "num_tokens": 872616253.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6157795190811157, + "learning_rate": 1e-06, + "loss": 0.3275, + "mean_token_accuracy": 0.8938640356063843, + "num_tokens": 872650871.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.679023265838623, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.876068115234375, + "num_tokens": 872688108.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8301533460617065, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8756142854690552, + "num_tokens": 872719532.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.680474877357483, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8719332814216614, + "num_tokens": 872759823.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7071994543075562, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8644556999206543, + "num_tokens": 872796195.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.555301547050476, + "learning_rate": 1e-06, + "loss": 0.3429, + "mean_token_accuracy": 0.8883435130119324, + "num_tokens": 872836754.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6619747877120972, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.873406171798706, + "num_tokens": 872875265.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8202087879180908, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8537936210632324, + "num_tokens": 872910063.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5962705612182617, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8747864961624146, + "num_tokens": 872951761.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6947314739227295, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8878343105316162, + "num_tokens": 872987661.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6078033447265625, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8804811835289001, + "num_tokens": 873024596.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6129965782165527, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8795552849769592, + "num_tokens": 873061727.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.786877989768982, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8739365339279175, + "num_tokens": 873094418.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.730607032775879, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8391220569610596, + "num_tokens": 873137628.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7379568815231323, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8806401491165161, + "num_tokens": 873170316.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.674548864364624, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8811664581298828, + "num_tokens": 873212026.0, + "step": 22886 + }, + { + "epoch": 2.9114616461010048, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5633060932159424, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8849856853485107, + "num_tokens": 873253410.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6812868118286133, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8742600679397583, + "num_tokens": 873288949.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5906827449798584, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8754435777664185, + "num_tokens": 873328247.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.636042594909668, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8633068799972534, + "num_tokens": 873373761.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5500400066375732, + "learning_rate": 1e-06, + "loss": 0.3535, + "mean_token_accuracy": 0.884457528591156, + "num_tokens": 873413839.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6768388748168945, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8738362789154053, + "num_tokens": 873448686.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8387621641159058, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8742817640304565, + "num_tokens": 873479668.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.712024211883545, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.874018669128418, + "num_tokens": 873516546.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8303242921829224, + "learning_rate": 1e-06, + "loss": 0.3522, + "mean_token_accuracy": 0.8817386627197266, + "num_tokens": 873550196.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7211532592773438, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8596091866493225, + "num_tokens": 873594203.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.685642123222351, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.877649188041687, + "num_tokens": 873630933.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4603296518325806, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8727430105209351, + "num_tokens": 873678652.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.540018916130066, + "learning_rate": 1e-06, + "loss": 0.3362, + "mean_token_accuracy": 0.8927133083343506, + "num_tokens": 873715447.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.826218605041504, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8365655541419983, + "num_tokens": 873752934.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5947885513305664, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8706218600273132, + "num_tokens": 873792715.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.771404504776001, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8772933483123779, + "num_tokens": 873826812.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6396816968917847, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8791811466217041, + "num_tokens": 873864487.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6622238159179688, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8658945560455322, + "num_tokens": 873904858.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7933555841445923, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.857330858707428, + "num_tokens": 873941342.0, + "step": 22905 + }, + { + "epoch": 2.913878641394225, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.650651454925537, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8792320489883423, + "num_tokens": 873979901.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6441303491592407, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8571668863296509, + "num_tokens": 874022504.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7674846649169922, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8736113905906677, + "num_tokens": 874057004.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6775987148284912, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.88336181640625, + "num_tokens": 874088896.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5174676179885864, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8792060613632202, + "num_tokens": 874130296.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.606227993965149, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8729666471481323, + "num_tokens": 874171085.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5488226413726807, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8713090419769287, + "num_tokens": 874215834.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.752032995223999, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8573590517044067, + "num_tokens": 874256731.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6182892322540283, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8670548796653748, + "num_tokens": 874296908.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5744249820709229, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8733564615249634, + "num_tokens": 874334918.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7764173746109009, + "learning_rate": 1e-06, + "loss": 0.3435, + "mean_token_accuracy": 0.8831071853637695, + "num_tokens": 874365592.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5276484489440918, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8823268413543701, + "num_tokens": 874407187.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.532194972038269, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8860318064689636, + "num_tokens": 874446510.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6033601760864258, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8833203315734863, + "num_tokens": 874481702.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.477786898612976, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8773671388626099, + "num_tokens": 874526015.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.1879048347473145, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8788948655128479, + "num_tokens": 874553188.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7238305807113647, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8844102621078491, + "num_tokens": 874585895.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5526434183120728, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8630877137184143, + "num_tokens": 874627203.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8187685012817383, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8645390272140503, + "num_tokens": 874658955.0, + "step": 22924 + }, + { + "epoch": 2.9162956366874444, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.767585277557373, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8726792335510254, + "num_tokens": 874692696.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.620989441871643, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.867246150970459, + "num_tokens": 874732827.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5790269374847412, + "learning_rate": 1e-06, + "loss": 0.3466, + "mean_token_accuracy": 0.889188289642334, + "num_tokens": 874770094.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5915167331695557, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8711900115013123, + "num_tokens": 874810986.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.581459403038025, + "learning_rate": 1e-06, + "loss": 0.3526, + "mean_token_accuracy": 0.8838211894035339, + "num_tokens": 874848541.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.561293363571167, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8869690895080566, + "num_tokens": 874886851.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6032336950302124, + "learning_rate": 1e-06, + "loss": 0.2964, + "mean_token_accuracy": 0.9032524228096008, + "num_tokens": 874918647.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5896708965301514, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8713944554328918, + "num_tokens": 874958899.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.4822723865509033, + "learning_rate": 1e-06, + "loss": 0.323, + "mean_token_accuracy": 0.895117998123169, + "num_tokens": 874999727.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6236318349838257, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8802345395088196, + "num_tokens": 875038108.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6095075607299805, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8849815130233765, + "num_tokens": 875076491.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.601806879043579, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8684309720993042, + "num_tokens": 875116690.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9517625570297241, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8869242668151855, + "num_tokens": 875153694.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6274665594100952, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8772537112236023, + "num_tokens": 875192446.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6283257007598877, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8608230352401733, + "num_tokens": 875230496.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7193986177444458, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8652902841567993, + "num_tokens": 875266544.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5808531045913696, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8804177045822144, + "num_tokens": 875305129.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6712682247161865, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8661895990371704, + "num_tokens": 875343861.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6433887481689453, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8758760094642639, + "num_tokens": 875384694.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5881181955337524, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8762094378471375, + "num_tokens": 875426520.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.697393536567688, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8829633593559265, + "num_tokens": 875464791.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5435806512832642, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8884525299072266, + "num_tokens": 875504646.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5874055624008179, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.879831850528717, + "num_tokens": 875543878.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6324132680892944, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8817832469940186, + "num_tokens": 875580475.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6335256099700928, + "learning_rate": 1e-06, + "loss": 0.3444, + "mean_token_accuracy": 0.8882371187210083, + "num_tokens": 875615779.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.574758529663086, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8704595565795898, + "num_tokens": 875657261.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4541656970977783, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.883392870426178, + "num_tokens": 875699667.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5611642599105835, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8778098821640015, + "num_tokens": 875738438.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6927822828292847, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8555062413215637, + "num_tokens": 875779326.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6236001253128052, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8799262046813965, + "num_tokens": 875816352.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6004482507705688, + "learning_rate": 1e-06, + "loss": 0.3536, + "mean_token_accuracy": 0.8862117528915405, + "num_tokens": 875852401.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5823912620544434, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8759633898735046, + "num_tokens": 875890620.0, + "step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5765712261199951, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8788295984268188, + "num_tokens": 875928861.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7109299898147583, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8840090036392212, + "num_tokens": 875967983.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.603516697883606, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8664399981498718, + "num_tokens": 876006692.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5835093259811401, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8741229772567749, + "num_tokens": 876047884.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5775327682495117, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8674447536468506, + "num_tokens": 876093472.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7772303819656372, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8681248426437378, + "num_tokens": 876125755.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6870478391647339, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8666511178016663, + "num_tokens": 876162806.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5744657516479492, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8799726366996765, + "num_tokens": 876203729.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6809736490249634, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.877280592918396, + "num_tokens": 876238569.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.665305256843567, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8746411800384521, + "num_tokens": 876275961.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6474580764770508, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8817216753959656, + "num_tokens": 876315141.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6311886310577393, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8789675831794739, + "num_tokens": 876353319.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.662843108177185, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8607373237609863, + "num_tokens": 876389549.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5736746788024902, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8787345886230469, + "num_tokens": 876430606.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6158937215805054, + "learning_rate": 1e-06, + "loss": 0.3239, + "mean_token_accuracy": 0.8949626684188843, + "num_tokens": 876463968.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5975266695022583, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8852605819702148, + "num_tokens": 876496375.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.543096899986267, + "learning_rate": 1e-06, + "loss": 0.3513, + "mean_token_accuracy": 0.8881241083145142, + "num_tokens": 876532921.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5669255256652832, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8697087168693542, + "num_tokens": 876573468.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6994845867156982, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8746424913406372, + "num_tokens": 876610734.0, + "step": 22975 + }, + { + "epoch": 2.9227833608955605, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5469313859939575, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.868279218673706, + "num_tokens": 876653450.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.789904236793518, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8622227311134338, + "num_tokens": 876686783.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5675052404403687, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8796229362487793, + "num_tokens": 876725488.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6479095220565796, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8872480988502502, + "num_tokens": 876760450.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4677401781082153, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8871678113937378, + "num_tokens": 876802121.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5035492181777954, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8776272535324097, + "num_tokens": 876843910.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7328604459762573, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8702393770217896, + "num_tokens": 876879697.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6409599781036377, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8797577619552612, + "num_tokens": 876917085.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.629905343055725, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8775968551635742, + "num_tokens": 876953358.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6423511505126953, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8641705513000488, + "num_tokens": 876992503.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8105437755584717, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8706913590431213, + "num_tokens": 877024673.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5862423181533813, + "learning_rate": 1e-06, + "loss": 0.3334, + "mean_token_accuracy": 0.8913544416427612, + "num_tokens": 877060813.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.600706696510315, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8746432065963745, + "num_tokens": 877099011.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.505202054977417, + "learning_rate": 1e-06, + "loss": 0.3307, + "mean_token_accuracy": 0.8906288146972656, + "num_tokens": 877136927.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5900171995162964, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8708592653274536, + "num_tokens": 877175081.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7004261016845703, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8663387298583984, + "num_tokens": 877209991.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7780965566635132, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8650134205818176, + "num_tokens": 877244186.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8329654932022095, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8706256151199341, + "num_tokens": 877276353.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6663177013397217, + "learning_rate": 1e-06, + "loss": 0.3385, + "mean_token_accuracy": 0.8908790349960327, + "num_tokens": 877313032.0, + "step": 22994 + }, + { + "epoch": 2.92520035618878, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4813454151153564, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8758361339569092, + "num_tokens": 877357258.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.573883295059204, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.869568943977356, + "num_tokens": 877397318.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7760592699050903, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8576608896255493, + "num_tokens": 877434352.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.537268877029419, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8831419944763184, + "num_tokens": 877475347.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.701025128364563, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8630599975585938, + "num_tokens": 877516676.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.762332558631897, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8717721700668335, + "num_tokens": 877553501.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6127054691314697, + "learning_rate": 1e-06, + "loss": 0.35, + "mean_token_accuracy": 0.8878594636917114, + "num_tokens": 877590917.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.621174931526184, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8733969330787659, + "num_tokens": 877626633.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6815868616104126, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8541110754013062, + "num_tokens": 877667083.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7189618349075317, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8752921223640442, + "num_tokens": 877699562.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6148686408996582, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8834552764892578, + "num_tokens": 877738797.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8037320375442505, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8563085198402405, + "num_tokens": 877773215.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6096813678741455, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8727719187736511, + "num_tokens": 877813762.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6749521493911743, + "learning_rate": 1e-06, + "loss": 0.351, + "mean_token_accuracy": 0.8848645687103271, + "num_tokens": 877848797.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6863659620285034, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8610197901725769, + "num_tokens": 877887022.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5523133277893066, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8735945224761963, + "num_tokens": 877928318.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9150744676589966, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8591044545173645, + "num_tokens": 877961220.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.690861463546753, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8604975342750549, + "num_tokens": 878002438.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6276025772094727, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8694514632225037, + "num_tokens": 878044664.0, + "step": 23013 + }, + { + "epoch": 2.9276173514819996, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.71173095703125, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8686181902885437, + "num_tokens": 878085932.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5501197576522827, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8727902173995972, + "num_tokens": 878127267.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.628252625465393, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8717044591903687, + "num_tokens": 878164116.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.619089126586914, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8850569725036621, + "num_tokens": 878200174.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6569976806640625, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8687581419944763, + "num_tokens": 878239854.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5474879741668701, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8745782375335693, + "num_tokens": 878283859.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7238469123840332, + "learning_rate": 1e-06, + "loss": 0.3265, + "mean_token_accuracy": 0.894446611404419, + "num_tokens": 878318653.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5804554224014282, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8765597343444824, + "num_tokens": 878360616.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7857528924942017, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8718880414962769, + "num_tokens": 878393550.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.529374599456787, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8766654133796692, + "num_tokens": 878434346.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6416046619415283, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8696370124816895, + "num_tokens": 878469737.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5890408754348755, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8752031326293945, + "num_tokens": 878506428.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.506691813468933, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8825008869171143, + "num_tokens": 878547235.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6661626100540161, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8769124746322632, + "num_tokens": 878587796.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5949485301971436, + "learning_rate": 1e-06, + "loss": 0.3663, + "mean_token_accuracy": 0.879355788230896, + "num_tokens": 878625401.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.557137131690979, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8822271227836609, + "num_tokens": 878662381.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.636149287223816, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8578600287437439, + "num_tokens": 878701705.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6789902448654175, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8804088830947876, + "num_tokens": 878738151.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8490543365478516, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8756723403930664, + "num_tokens": 878769039.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6699148416519165, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8774912357330322, + "num_tokens": 878804509.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7416415214538574, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8736168742179871, + "num_tokens": 878840569.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7866227626800537, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8796109557151794, + "num_tokens": 878874836.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.706478238105774, + "learning_rate": 1e-06, + "loss": 0.3296, + "mean_token_accuracy": 0.8921477794647217, + "num_tokens": 878908002.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7213116884231567, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8711408376693726, + "num_tokens": 878946168.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.691101312637329, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8834619522094727, + "num_tokens": 878982987.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5040693283081055, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8823061585426331, + "num_tokens": 879021314.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8054044246673584, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.86277174949646, + "num_tokens": 879054400.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5678468942642212, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.871540904045105, + "num_tokens": 879091899.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6241939067840576, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8707918524742126, + "num_tokens": 879131990.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6772799491882324, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8632436394691467, + "num_tokens": 879170452.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5233943462371826, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8770104050636292, + "num_tokens": 879210301.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.532984972000122, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8721982836723328, + "num_tokens": 879253152.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6917643547058105, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8671243190765381, + "num_tokens": 879289401.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5432274341583252, + "learning_rate": 1e-06, + "loss": 0.3534, + "mean_token_accuracy": 0.8842451572418213, + "num_tokens": 879329259.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.740821361541748, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.86578369140625, + "num_tokens": 879365725.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.883539080619812, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8679885864257812, + "num_tokens": 879398499.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7000315189361572, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.871102511882782, + "num_tokens": 879434139.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6152719259262085, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8811047077178955, + "num_tokens": 879474198.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6665130853652954, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8649333119392395, + "num_tokens": 879516643.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6617933511734009, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8685104846954346, + "num_tokens": 879553185.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6151857376098633, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8741128444671631, + "num_tokens": 879590343.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7275816202163696, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8585562109947205, + "num_tokens": 879626297.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6291985511779785, + "learning_rate": 1e-06, + "loss": 0.3458, + "mean_token_accuracy": 0.8909206390380859, + "num_tokens": 879663014.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5759236812591553, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8559303283691406, + "num_tokens": 879706261.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7909497022628784, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.883064866065979, + "num_tokens": 879736205.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6295987367630005, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.880634605884552, + "num_tokens": 879774554.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7236970663070679, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8858331441879272, + "num_tokens": 879805318.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6709784269332886, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8458073735237122, + "num_tokens": 879849107.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6302522420883179, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.87347012758255, + "num_tokens": 879887731.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4854406118392944, + "learning_rate": 1e-06, + "loss": 0.335, + "mean_token_accuracy": 0.8895428776741028, + "num_tokens": 879926440.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6279544830322266, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8690595030784607, + "num_tokens": 879965281.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5970102548599243, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8694367408752441, + "num_tokens": 880009552.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5445870161056519, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8793349862098694, + "num_tokens": 880049290.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6713007688522339, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8724920153617859, + "num_tokens": 880089592.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5874395370483398, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8582257032394409, + "num_tokens": 880131292.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8194102048873901, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8681561350822449, + "num_tokens": 880165329.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6410672664642334, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8735193014144897, + "num_tokens": 880201870.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.4525034427642822, + "learning_rate": 1e-06, + "loss": 0.3349, + "mean_token_accuracy": 0.8897546529769897, + "num_tokens": 880245340.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.769875168800354, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8751479983329773, + "num_tokens": 880278570.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5481343269348145, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8798071146011353, + "num_tokens": 880319779.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.674526572227478, + "learning_rate": 1e-06, + "loss": 0.3468, + "mean_token_accuracy": 0.8857052326202393, + "num_tokens": 880351675.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5109360218048096, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8703784942626953, + "num_tokens": 880394922.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6071985960006714, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8756684064865112, + "num_tokens": 880434726.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8476712703704834, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8651036620140076, + "num_tokens": 880469719.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5387192964553833, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8750995993614197, + "num_tokens": 880512546.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6308448314666748, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8793075084686279, + "num_tokens": 880548198.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5778018236160278, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8690564632415771, + "num_tokens": 880588269.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.720575213432312, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8628462553024292, + "num_tokens": 880623886.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8142818212509155, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8630161285400391, + "num_tokens": 880662799.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7940237522125244, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8584735989570618, + "num_tokens": 880696690.0, + "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5653725862503052, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8697755932807922, + "num_tokens": 880740138.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6647424697875977, + "learning_rate": 1e-06, + "loss": 0.3453, + "mean_token_accuracy": 0.8858446478843689, + "num_tokens": 880776242.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6082366704940796, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8724480867385864, + "num_tokens": 880816322.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.644115686416626, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8804234862327576, + "num_tokens": 880853427.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7465026378631592, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8847049474716187, + "num_tokens": 880889418.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5998979806900024, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8752312660217285, + "num_tokens": 880931737.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.52504301071167, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8842527866363525, + "num_tokens": 880969920.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7731853723526, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8764671683311462, + "num_tokens": 881008065.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6666799783706665, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8674514889717102, + "num_tokens": 881047674.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8105170726776123, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8661642074584961, + "num_tokens": 881087329.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5643571615219116, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8796151876449585, + "num_tokens": 881125738.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.576505422592163, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8680282831192017, + "num_tokens": 881170452.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7793958187103271, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8583968877792358, + "num_tokens": 881208850.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5775657892227173, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8827131986618042, + "num_tokens": 881248921.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6920863389968872, + "learning_rate": 1e-06, + "loss": 0.3464, + "mean_token_accuracy": 0.8851632475852966, + "num_tokens": 881286217.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5147861242294312, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8678888082504272, + "num_tokens": 881329550.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.676090121269226, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8652166128158569, + "num_tokens": 881366959.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6135762929916382, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8713263869285583, + "num_tokens": 881409239.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.567046046257019, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8791817426681519, + "num_tokens": 881448065.0, + "step": 23102 + }, + { + "epoch": 2.9389390662765553, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6024391651153564, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8724507093429565, + "num_tokens": 881487486.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6174770593643188, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8737461566925049, + "num_tokens": 881525981.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5762145519256592, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8663628697395325, + "num_tokens": 881570815.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6517139673233032, + "learning_rate": 1e-06, + "loss": 0.3511, + "mean_token_accuracy": 0.8833390474319458, + "num_tokens": 881611990.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5426716804504395, + "learning_rate": 1e-06, + "loss": 0.3551, + "mean_token_accuracy": 0.8824676871299744, + "num_tokens": 881650116.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.600112795829773, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.876509964466095, + "num_tokens": 881690848.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8107373714447021, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8737114667892456, + "num_tokens": 881725467.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5764206647872925, + "learning_rate": 1e-06, + "loss": 0.3115, + "mean_token_accuracy": 0.8980242013931274, + "num_tokens": 881765525.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.607244610786438, + "learning_rate": 1e-06, + "loss": 0.3337, + "mean_token_accuracy": 0.8925163149833679, + "num_tokens": 881801137.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5891259908676147, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.8779369592666626, + "num_tokens": 881838833.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6139001846313477, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8866841197013855, + "num_tokens": 881877115.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6941789388656616, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8656374216079712, + "num_tokens": 881916120.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6147129535675049, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8831637501716614, + "num_tokens": 881956486.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.626564621925354, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8759950399398804, + "num_tokens": 881990783.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.480177402496338, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8672761917114258, + "num_tokens": 882036226.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6497009992599487, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8865855932235718, + "num_tokens": 882074729.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5424182415008545, + "learning_rate": 1e-06, + "loss": 0.3344, + "mean_token_accuracy": 0.8936002254486084, + "num_tokens": 882114069.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6040825843811035, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8702771663665771, + "num_tokens": 882154911.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7341153621673584, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8735672831535339, + "num_tokens": 882189257.0, + "step": 23121 + }, + { + "epoch": 2.941356061569775, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5812960863113403, + "learning_rate": 1e-06, + "loss": 0.3242, + "mean_token_accuracy": 0.8927063345909119, + "num_tokens": 882221103.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5649182796478271, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.878850519657135, + "num_tokens": 882262333.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6443684101104736, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8754511475563049, + "num_tokens": 882303827.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6925259828567505, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8667917251586914, + "num_tokens": 882342912.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.856644630432129, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8494322299957275, + "num_tokens": 882377660.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.591675043106079, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.878491997718811, + "num_tokens": 882417688.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6626548767089844, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8662682771682739, + "num_tokens": 882456790.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7642165422439575, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8567514419555664, + "num_tokens": 882496803.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6427605152130127, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8699623942375183, + "num_tokens": 882536863.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6636885404586792, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8689106702804565, + "num_tokens": 882574489.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6663365364074707, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8664581179618835, + "num_tokens": 882611688.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5848044157028198, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8595316410064697, + "num_tokens": 882650758.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7373254299163818, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8577625751495361, + "num_tokens": 882688972.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.693215012550354, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8771945238113403, + "num_tokens": 882723140.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.605549693107605, + "learning_rate": 1e-06, + "loss": 0.3308, + "mean_token_accuracy": 0.8918393850326538, + "num_tokens": 882756187.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.524653673171997, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.868816614151001, + "num_tokens": 882800419.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5735570192337036, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8758898973464966, + "num_tokens": 882838104.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7308342456817627, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8675293922424316, + "num_tokens": 882877299.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.725575566291809, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8769243955612183, + "num_tokens": 882909771.0, + "step": 23140 + }, + { + "epoch": 2.9437730568629945, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.704914927482605, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8633737564086914, + "num_tokens": 882946370.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6910299062728882, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.870429277420044, + "num_tokens": 882980454.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5454668998718262, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.878092348575592, + "num_tokens": 883022904.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6457818746566772, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8783583641052246, + "num_tokens": 883059876.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5824443101882935, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8649130463600159, + "num_tokens": 883099090.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.712733507156372, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8741624355316162, + "num_tokens": 883136509.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5687059164047241, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8781793117523193, + "num_tokens": 883176422.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5930960178375244, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8692114949226379, + "num_tokens": 883219226.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6611220836639404, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8696438074111938, + "num_tokens": 883256957.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7848262786865234, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8718670606613159, + "num_tokens": 883292324.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5913816690444946, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8749523758888245, + "num_tokens": 883335631.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5476605892181396, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8730021119117737, + "num_tokens": 883376986.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.656470775604248, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8775423765182495, + "num_tokens": 883412071.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.552890658378601, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8629601001739502, + "num_tokens": 883455469.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7073469161987305, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8656219244003296, + "num_tokens": 883489500.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6794514656066895, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8781975507736206, + "num_tokens": 883524258.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5524331331253052, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8744076490402222, + "num_tokens": 883565059.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5907514095306396, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8633185625076294, + "num_tokens": 883606340.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7004351615905762, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8620229959487915, + "num_tokens": 883643519.0, + "step": 23159 + }, + { + "epoch": 2.946190052156214, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7549827098846436, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8887901902198792, + "num_tokens": 883675651.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.618900179862976, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.87153559923172, + "num_tokens": 883714611.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.537803053855896, + "learning_rate": 1e-06, + "loss": 0.3223, + "mean_token_accuracy": 0.8941808342933655, + "num_tokens": 883754066.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5882078409194946, + "learning_rate": 1e-06, + "loss": 0.3549, + "mean_token_accuracy": 0.8838200569152832, + "num_tokens": 883790845.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8104127645492554, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8472371101379395, + "num_tokens": 883827327.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7112064361572266, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8769913911819458, + "num_tokens": 883861683.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6595983505249023, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8691872358322144, + "num_tokens": 883900747.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6110953092575073, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8693794012069702, + "num_tokens": 883938290.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.708486795425415, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8703702688217163, + "num_tokens": 883974611.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6799801588058472, + "learning_rate": 1e-06, + "loss": 0.3433, + "mean_token_accuracy": 0.8890682458877563, + "num_tokens": 884007142.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6498736143112183, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8840256929397583, + "num_tokens": 884045030.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8052639961242676, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8616540431976318, + "num_tokens": 884084827.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5778106451034546, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8652479648590088, + "num_tokens": 884127123.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5833332538604736, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8816624283790588, + "num_tokens": 884164631.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6709779500961304, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8749760985374451, + "num_tokens": 884200931.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.67720627784729, + "learning_rate": 1e-06, + "loss": 0.3292, + "mean_token_accuracy": 0.894279956817627, + "num_tokens": 884232970.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6219184398651123, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8743890523910522, + "num_tokens": 884271196.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7516738176345825, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8637080192565918, + "num_tokens": 884305116.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7761980295181274, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8648806810379028, + "num_tokens": 884339490.0, + "step": 23178 + }, + { + "epoch": 2.9486070474494337, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.770156741142273, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8760520219802856, + "num_tokens": 884372382.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5777523517608643, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8662705421447754, + "num_tokens": 884414568.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6124918460845947, + "learning_rate": 1e-06, + "loss": 0.3443, + "mean_token_accuracy": 0.8896105885505676, + "num_tokens": 884453083.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7218248844146729, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8611060380935669, + "num_tokens": 884490933.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.64620840549469, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8697417974472046, + "num_tokens": 884529043.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6595067977905273, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8673193454742432, + "num_tokens": 884568123.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6575076580047607, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8882492780685425, + "num_tokens": 884601702.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5609962940216064, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8743140697479248, + "num_tokens": 884642234.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5970420837402344, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8669497966766357, + "num_tokens": 884682398.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5805838108062744, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8770889043807983, + "num_tokens": 884722079.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6610102653503418, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8624265789985657, + "num_tokens": 884758307.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6043165922164917, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8811995983123779, + "num_tokens": 884798059.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5906200408935547, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8734269142150879, + "num_tokens": 884837385.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7970523834228516, + "learning_rate": 1e-06, + "loss": 0.3441, + "mean_token_accuracy": 0.8855718374252319, + "num_tokens": 884865436.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5237504243850708, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8791148066520691, + "num_tokens": 884906228.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6425715684890747, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8789916634559631, + "num_tokens": 884945649.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6289499998092651, + "learning_rate": 1e-06, + "loss": 0.3339, + "mean_token_accuracy": 0.8934428095817566, + "num_tokens": 884982134.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7659255266189575, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8794898390769958, + "num_tokens": 885013117.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6072343587875366, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.877449631690979, + "num_tokens": 885054939.0, + "step": 23197 + }, + { + "epoch": 2.9510240427426537, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6479812860488892, + "learning_rate": 1e-06, + "loss": 0.3497, + "mean_token_accuracy": 0.8871227502822876, + "num_tokens": 885087796.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8447682857513428, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8639127016067505, + "num_tokens": 885120084.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5222511291503906, + "learning_rate": 1e-06, + "loss": 0.3279, + "mean_token_accuracy": 0.89278244972229, + "num_tokens": 885159323.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7469109296798706, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8653955459594727, + "num_tokens": 885195844.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6655787229537964, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8697031140327454, + "num_tokens": 885233291.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.74213445186615, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8586893081665039, + "num_tokens": 885268768.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7994860410690308, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8717374801635742, + "num_tokens": 885302335.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5087330341339111, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8738980293273926, + "num_tokens": 885349253.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6770215034484863, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.878341794013977, + "num_tokens": 885384608.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5748050212860107, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8772541284561157, + "num_tokens": 885421619.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7079198360443115, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8752856254577637, + "num_tokens": 885454173.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5812773704528809, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.878320574760437, + "num_tokens": 885491534.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6415185928344727, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8756377696990967, + "num_tokens": 885530262.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6589021682739258, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8715689182281494, + "num_tokens": 885568672.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6803104877471924, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8614761233329773, + "num_tokens": 885604638.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.786540150642395, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8775999546051025, + "num_tokens": 885638532.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5036249160766602, + "learning_rate": 1e-06, + "loss": 0.3268, + "mean_token_accuracy": 0.894204318523407, + "num_tokens": 885679440.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6941289901733398, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8826134204864502, + "num_tokens": 885715426.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7334319353103638, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8689968585968018, + "num_tokens": 885753335.0, + "step": 23216 + }, + { + "epoch": 2.9534410380358733, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7472953796386719, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8638251423835754, + "num_tokens": 885792223.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6828101873397827, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8774856925010681, + "num_tokens": 885827501.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5866100788116455, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8836520314216614, + "num_tokens": 885865206.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7259669303894043, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.87298583984375, + "num_tokens": 885898580.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6698317527770996, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8745923638343811, + "num_tokens": 885933673.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8408650159835815, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8673649430274963, + "num_tokens": 885966577.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4733855724334717, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8768893480300903, + "num_tokens": 886012070.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.848918080329895, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8681021928787231, + "num_tokens": 886045003.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6914222240447998, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8758672475814819, + "num_tokens": 886080346.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8644288778305054, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8653581142425537, + "num_tokens": 886119589.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7074693441390991, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8766947388648987, + "num_tokens": 886154462.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6524688005447388, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8780120015144348, + "num_tokens": 886188437.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5612168312072754, + "learning_rate": 1e-06, + "loss": 0.3381, + "mean_token_accuracy": 0.887976884841919, + "num_tokens": 886225347.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.722018837928772, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8834589719772339, + "num_tokens": 886259981.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7046579122543335, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8659185171127319, + "num_tokens": 886296672.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.706719994544983, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.853920578956604, + "num_tokens": 886336891.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8432484865188599, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8638538718223572, + "num_tokens": 886370277.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7818396091461182, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8613842725753784, + "num_tokens": 886403792.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5978872776031494, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8783310651779175, + "num_tokens": 886446528.0, + "step": 23235 + }, + { + "epoch": 2.955858033329093, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6847093105316162, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.879522442817688, + "num_tokens": 886484391.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7590742111206055, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8607878088951111, + "num_tokens": 886524622.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5448871850967407, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.882201075553894, + "num_tokens": 886566921.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7194006443023682, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.862647533416748, + "num_tokens": 886604466.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5437930822372437, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8729125261306763, + "num_tokens": 886647103.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5748000144958496, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8719030618667603, + "num_tokens": 886688189.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.830949306488037, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8855266571044922, + "num_tokens": 886722244.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7753897905349731, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8779703378677368, + "num_tokens": 886760360.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6554043292999268, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8520256280899048, + "num_tokens": 886801924.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7397770881652832, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8735964298248291, + "num_tokens": 886839319.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.566326379776001, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8827351331710815, + "num_tokens": 886879107.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7344661951065063, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8808768391609192, + "num_tokens": 886912349.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8935319185256958, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8726187944412231, + "num_tokens": 886943048.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.618026614189148, + "learning_rate": 1e-06, + "loss": 0.3495, + "mean_token_accuracy": 0.8856459856033325, + "num_tokens": 886978461.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8201946020126343, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.886006236076355, + "num_tokens": 887012316.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7808984518051147, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8749567270278931, + "num_tokens": 887046297.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7830591201782227, + "learning_rate": 1e-06, + "loss": 0.3347, + "mean_token_accuracy": 0.888614296913147, + "num_tokens": 887078648.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7248106002807617, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8847454786300659, + "num_tokens": 887117435.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5390695333480835, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8777827024459839, + "num_tokens": 887159207.0, + "step": 23254 + }, + { + "epoch": 2.958275028622313, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.76361882686615, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.870908260345459, + "num_tokens": 887192142.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5948618650436401, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.862861156463623, + "num_tokens": 887232358.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7158008813858032, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8792685866355896, + "num_tokens": 887268083.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7303690910339355, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8749616742134094, + "num_tokens": 887302971.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.628818392753601, + "learning_rate": 1e-06, + "loss": 0.3559, + "mean_token_accuracy": 0.8858834505081177, + "num_tokens": 887344825.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7198621034622192, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8656229972839355, + "num_tokens": 887381371.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5517559051513672, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8767845630645752, + "num_tokens": 887419257.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5702310800552368, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8759163618087769, + "num_tokens": 887461009.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6289163827896118, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8769843578338623, + "num_tokens": 887503145.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6515002250671387, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8798465728759766, + "num_tokens": 887539398.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6427925825119019, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8797120451927185, + "num_tokens": 887578819.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6288357973098755, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8653922080993652, + "num_tokens": 887621689.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5942925214767456, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8810113668441772, + "num_tokens": 887660838.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7602089643478394, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.871675968170166, + "num_tokens": 887694392.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5400443077087402, + "learning_rate": 1e-06, + "loss": 0.3203, + "mean_token_accuracy": 0.8973480463027954, + "num_tokens": 887733540.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7117736339569092, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8626103401184082, + "num_tokens": 887772011.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4774469137191772, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8813343644142151, + "num_tokens": 887815934.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7579624652862549, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8797404766082764, + "num_tokens": 887848678.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6637272834777832, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8557090759277344, + "num_tokens": 887889173.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6062029600143433, + "learning_rate": 1e-06, + "loss": 0.3405, + "mean_token_accuracy": 0.8904834985733032, + "num_tokens": 887925349.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.529409646987915, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8714802861213684, + "num_tokens": 887966696.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6867549419403076, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8601774573326111, + "num_tokens": 888006458.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5233981609344482, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8798922300338745, + "num_tokens": 888047875.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.606195092201233, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8847806453704834, + "num_tokens": 888086157.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.378122329711914, + "learning_rate": 1e-06, + "loss": 0.3373, + "mean_token_accuracy": 0.8907118439674377, + "num_tokens": 888131441.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5452642440795898, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8661007881164551, + "num_tokens": 888173812.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6016404628753662, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8744521141052246, + "num_tokens": 888213794.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.229004144668579, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8774710893630981, + "num_tokens": 888245033.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5671639442443848, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8751264810562134, + "num_tokens": 888283781.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6697697639465332, + "learning_rate": 1e-06, + "loss": 0.3571, + "mean_token_accuracy": 0.8858110904693604, + "num_tokens": 888318124.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5797593593597412, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8859320878982544, + "num_tokens": 888355151.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.505118727684021, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.888728141784668, + "num_tokens": 888396954.0, + "step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7447038888931274, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8633236885070801, + "num_tokens": 888435725.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5258296728134155, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8861621618270874, + "num_tokens": 888475348.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.806442379951477, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8830156922340393, + "num_tokens": 888509832.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5897284746170044, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8568293452262878, + "num_tokens": 888555031.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5679570436477661, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8826033473014832, + "num_tokens": 888593983.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6754895448684692, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8631424903869629, + "num_tokens": 888632933.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6304436922073364, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8720241785049438, + "num_tokens": 888670676.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5465872287750244, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8767858743667603, + "num_tokens": 888711792.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7680120468139648, + "learning_rate": 1e-06, + "loss": 0.3485, + "mean_token_accuracy": 0.8866570591926575, + "num_tokens": 888742156.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7686049938201904, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8833773732185364, + "num_tokens": 888771478.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5216280221939087, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8791817426681519, + "num_tokens": 888813341.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6505440473556519, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8672044277191162, + "num_tokens": 888853136.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.547592043876648, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8791534900665283, + "num_tokens": 888892331.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5483739376068115, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8803502321243286, + "num_tokens": 888931363.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.688507318496704, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8549294471740723, + "num_tokens": 888967228.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.671988606452942, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8758563995361328, + "num_tokens": 889004330.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.697767972946167, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8626211285591125, + "num_tokens": 889042379.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7414640188217163, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8772997260093689, + "num_tokens": 889076376.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6598867177963257, + "learning_rate": 1e-06, + "loss": 0.3426, + "mean_token_accuracy": 0.8908681869506836, + "num_tokens": 889108797.0, + "step": 23305 + }, + { + "epoch": 2.9647627528304286, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7103493213653564, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8822842836380005, + "num_tokens": 889148266.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.703504204750061, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8721827268600464, + "num_tokens": 889185310.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6002155542373657, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8618351221084595, + "num_tokens": 889228692.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 4.648996353149414, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8659179210662842, + "num_tokens": 889266010.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5959781408309937, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8794037103652954, + "num_tokens": 889300914.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5865610837936401, + "learning_rate": 1e-06, + "loss": 0.3469, + "mean_token_accuracy": 0.8879645466804504, + "num_tokens": 889341248.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5852805376052856, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8759449124336243, + "num_tokens": 889381244.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5830466747283936, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8881584405899048, + "num_tokens": 889418866.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6176844835281372, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8773041367530823, + "num_tokens": 889458386.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6240625381469727, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.861990213394165, + "num_tokens": 889498721.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7341917753219604, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8682916164398193, + "num_tokens": 889533688.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5449637174606323, + "learning_rate": 1e-06, + "loss": 0.3609, + "mean_token_accuracy": 0.8841835856437683, + "num_tokens": 889577034.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7185159921646118, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8638356924057007, + "num_tokens": 889615666.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6335099935531616, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8781660795211792, + "num_tokens": 889653801.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6666508913040161, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8699862957000732, + "num_tokens": 889692422.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6793320178985596, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8681873083114624, + "num_tokens": 889726130.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 2.249849557876587, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8917511701583862, + "num_tokens": 889761860.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.586734414100647, + "learning_rate": 1e-06, + "loss": 0.3519, + "mean_token_accuracy": 0.8826804161071777, + "num_tokens": 889799430.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5343246459960938, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8815032243728638, + "num_tokens": 889838353.0, + "step": 23324 + }, + { + "epoch": 2.967179748123648, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6020874977111816, + "learning_rate": 1e-06, + "loss": 0.3457, + "mean_token_accuracy": 0.8870246410369873, + "num_tokens": 889873308.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7231370210647583, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8758689165115356, + "num_tokens": 889905729.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4897544384002686, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8806641697883606, + "num_tokens": 889947645.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6038386821746826, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8741549849510193, + "num_tokens": 889984434.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6338196992874146, + "learning_rate": 1e-06, + "loss": 0.3516, + "mean_token_accuracy": 0.8895877599716187, + "num_tokens": 890016893.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6640762090682983, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8462547659873962, + "num_tokens": 890062050.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6458749771118164, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8710764646530151, + "num_tokens": 890100515.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5129148960113525, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8751106262207031, + "num_tokens": 890140536.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7121034860610962, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8663997054100037, + "num_tokens": 890177305.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 2.2371790409088135, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8727903366088867, + "num_tokens": 890216598.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6525158882141113, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8809677362442017, + "num_tokens": 890253890.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.573262095451355, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8820653557777405, + "num_tokens": 890291690.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6389602422714233, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8763300180435181, + "num_tokens": 890330442.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.661087155342102, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8771299719810486, + "num_tokens": 890366806.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7258350849151611, + "learning_rate": 1e-06, + "loss": 0.3591, + "mean_token_accuracy": 0.8825992941856384, + "num_tokens": 890397734.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7051328420639038, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8708646893501282, + "num_tokens": 890435697.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7563307285308838, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8648304343223572, + "num_tokens": 890472627.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7369014024734497, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8725171089172363, + "num_tokens": 890510130.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5995776653289795, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8834680914878845, + "num_tokens": 890549177.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5595242977142334, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8845946788787842, + "num_tokens": 890589671.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5427446365356445, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8859803676605225, + "num_tokens": 890629841.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5329532623291016, + "learning_rate": 1e-06, + "loss": 0.3357, + "mean_token_accuracy": 0.8918741345405579, + "num_tokens": 890668279.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6601133346557617, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.871495246887207, + "num_tokens": 890705039.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6715034246444702, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8696591854095459, + "num_tokens": 890738092.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.689395785331726, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8771542310714722, + "num_tokens": 890773434.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5603907108306885, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8842002153396606, + "num_tokens": 890811661.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7228766679763794, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.873584508895874, + "num_tokens": 890848263.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5893555879592896, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8900367021560669, + "num_tokens": 890884532.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6135653257369995, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8568426370620728, + "num_tokens": 890928964.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6811800003051758, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8672015070915222, + "num_tokens": 890969360.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7418254613876343, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8762876391410828, + "num_tokens": 891002241.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5037144422531128, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8815482258796692, + "num_tokens": 891042355.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.729387879371643, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8561172485351562, + "num_tokens": 891077117.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5685949325561523, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8765833377838135, + "num_tokens": 891115000.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5769270658493042, + "learning_rate": 1e-06, + "loss": 0.3367, + "mean_token_accuracy": 0.8912506103515625, + "num_tokens": 891153596.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.600865125656128, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8845105171203613, + "num_tokens": 891191481.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6730666160583496, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8616735339164734, + "num_tokens": 891229801.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6568779945373535, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8810914158821106, + "num_tokens": 891268919.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6300818920135498, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8837064504623413, + "num_tokens": 891307337.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5930365324020386, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8658490180969238, + "num_tokens": 891347918.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.61172616481781, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8833590745925903, + "num_tokens": 891385077.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6701107025146484, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8601197004318237, + "num_tokens": 891424370.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7662361860275269, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.853486180305481, + "num_tokens": 891460845.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.674497365951538, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8461210131645203, + "num_tokens": 891500999.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.585316777229309, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8673399090766907, + "num_tokens": 891541834.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6080745458602905, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.871669352054596, + "num_tokens": 891581036.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.729828953742981, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8674609065055847, + "num_tokens": 891616454.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 3.703242778778076, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8650268316268921, + "num_tokens": 891654890.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6194043159484863, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8842028379440308, + "num_tokens": 891692614.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6925400495529175, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8677835464477539, + "num_tokens": 891730230.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.654321551322937, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8709012866020203, + "num_tokens": 891765915.0, + "step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6428358554840088, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8754832744598389, + "num_tokens": 891806581.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5628657341003418, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8720083236694336, + "num_tokens": 891849395.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6018868684768677, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8891092538833618, + "num_tokens": 891890110.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.482297420501709, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8749428987503052, + "num_tokens": 891932452.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8064357042312622, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8722116947174072, + "num_tokens": 891962135.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6718131303787231, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8773962259292603, + "num_tokens": 891996286.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.592099666595459, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.880723237991333, + "num_tokens": 892032965.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6861480474472046, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8771973848342896, + "num_tokens": 892070590.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6178996562957764, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8846673369407654, + "num_tokens": 892105741.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.785145878791809, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8620810508728027, + "num_tokens": 892141225.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.723320484161377, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8711235523223877, + "num_tokens": 892174319.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6770753860473633, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8686322569847107, + "num_tokens": 892211311.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7949206829071045, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8721926808357239, + "num_tokens": 892240909.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.9893183708190918, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8614282608032227, + "num_tokens": 892271584.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6226820945739746, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8805413246154785, + "num_tokens": 892309823.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.675632119178772, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8681249022483826, + "num_tokens": 892347701.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.618836522102356, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8664208650588989, + "num_tokens": 892389430.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.561481237411499, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8846843242645264, + "num_tokens": 892425620.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6288999319076538, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8774352073669434, + "num_tokens": 892463443.0, + "step": 23394 + }, + { + "epoch": 2.9760844676249842, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.622466802597046, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8840103149414062, + "num_tokens": 892503441.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7712156772613525, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8768739700317383, + "num_tokens": 892539059.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7015478610992432, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8807180523872375, + "num_tokens": 892572179.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.48146390914917, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8863410949707031, + "num_tokens": 892617682.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.4478447437286377, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8853540420532227, + "num_tokens": 892663338.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6268357038497925, + "learning_rate": 1e-06, + "loss": 0.3641, + "mean_token_accuracy": 0.8836194276809692, + "num_tokens": 892698662.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5214102268218994, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8868627548217773, + "num_tokens": 892739263.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7304620742797852, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8739312887191772, + "num_tokens": 892774986.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.616223692893982, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8868570327758789, + "num_tokens": 892814807.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7652547359466553, + "learning_rate": 1e-06, + "loss": 0.3325, + "mean_token_accuracy": 0.8900122046470642, + "num_tokens": 892844349.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.676446795463562, + "learning_rate": 1e-06, + "loss": 0.3623, + "mean_token_accuracy": 0.8839112520217896, + "num_tokens": 892879771.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6764955520629883, + "learning_rate": 1e-06, + "loss": 0.3261, + "mean_token_accuracy": 0.8934029340744019, + "num_tokens": 892912160.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5249478816986084, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8759230971336365, + "num_tokens": 892954221.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7152600288391113, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8425049781799316, + "num_tokens": 892995745.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.719289779663086, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8808648586273193, + "num_tokens": 893034826.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6880625486373901, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8719905614852905, + "num_tokens": 893070126.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.577880620956421, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8550152778625488, + "num_tokens": 893113499.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.67208731174469, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8721693158149719, + "num_tokens": 893148574.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5680679082870483, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8791794776916504, + "num_tokens": 893184790.0, + "step": 23413 + }, + { + "epoch": 2.978501462918204, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 3.817363739013672, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8540996313095093, + "num_tokens": 893220244.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7546977996826172, + "learning_rate": 1e-06, + "loss": 0.3514, + "mean_token_accuracy": 0.8842163681983948, + "num_tokens": 893255116.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.682233214378357, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8640830516815186, + "num_tokens": 893295067.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7499113082885742, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8640389442443848, + "num_tokens": 893336582.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6051117181777954, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8730340600013733, + "num_tokens": 893375577.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.645673394203186, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.873358964920044, + "num_tokens": 893414151.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6156364679336548, + "learning_rate": 1e-06, + "loss": 0.3531, + "mean_token_accuracy": 0.8872361779212952, + "num_tokens": 893452565.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5855203866958618, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.86942458152771, + "num_tokens": 893499442.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8308833837509155, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8731504678726196, + "num_tokens": 893533189.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5182589292526245, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8826867341995239, + "num_tokens": 893572472.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5559513568878174, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.883447527885437, + "num_tokens": 893613452.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5859369039535522, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8713558912277222, + "num_tokens": 893652299.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8360111713409424, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8591610193252563, + "num_tokens": 893687930.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6769403219223022, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.867668867111206, + "num_tokens": 893721544.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8219566345214844, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8758680820465088, + "num_tokens": 893757723.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6645607948303223, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8727536797523499, + "num_tokens": 893794306.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7198492288589478, + "learning_rate": 1e-06, + "loss": 0.3471, + "mean_token_accuracy": 0.8882573246955872, + "num_tokens": 893825226.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7920795679092407, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8745315074920654, + "num_tokens": 893859103.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7433793544769287, + "learning_rate": 1e-06, + "loss": 0.3067, + "mean_token_accuracy": 0.9005349278450012, + "num_tokens": 893893433.0, + "step": 23432 + }, + { + "epoch": 2.9809184582114234, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8230692148208618, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8744395971298218, + "num_tokens": 893928179.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.835275411605835, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8711302280426025, + "num_tokens": 893963482.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.532160997390747, + "learning_rate": 1e-06, + "loss": 0.3032, + "mean_token_accuracy": 0.9009791612625122, + "num_tokens": 893995771.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8188890218734741, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8729826211929321, + "num_tokens": 894029713.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5158735513687134, + "learning_rate": 1e-06, + "loss": 0.3425, + "mean_token_accuracy": 0.8879876136779785, + "num_tokens": 894069956.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.700140118598938, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8759528398513794, + "num_tokens": 894105455.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8178818225860596, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8756117820739746, + "num_tokens": 894136047.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.54932701587677, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8888091444969177, + "num_tokens": 894174724.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6609060764312744, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8701037168502808, + "num_tokens": 894208640.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7225656509399414, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8887450695037842, + "num_tokens": 894239794.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8289968967437744, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8586269021034241, + "num_tokens": 894271652.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.523343801498413, + "learning_rate": 1e-06, + "loss": 0.3461, + "mean_token_accuracy": 0.8869150876998901, + "num_tokens": 894309273.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.675639033317566, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8753242492675781, + "num_tokens": 894343617.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6111928224563599, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8787960410118103, + "num_tokens": 894378754.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5562527179718018, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8743793368339539, + "num_tokens": 894419358.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5664982795715332, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8705565333366394, + "num_tokens": 894457314.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5186026096343994, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8870277404785156, + "num_tokens": 894499105.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4313398599624634, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8711283206939697, + "num_tokens": 894543218.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6001867055892944, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.865278959274292, + "num_tokens": 894582947.0, + "step": 23451 + }, + { + "epoch": 2.983335453504643, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5017025470733643, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8806352615356445, + "num_tokens": 894625398.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5820467472076416, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8847857713699341, + "num_tokens": 894662578.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6169930696487427, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.876008927822113, + "num_tokens": 894703154.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8511654138565063, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8750924468040466, + "num_tokens": 894736684.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5702569484710693, + "learning_rate": 1e-06, + "loss": 0.3508, + "mean_token_accuracy": 0.8876314163208008, + "num_tokens": 894773337.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7161005735397339, + "learning_rate": 1e-06, + "loss": 0.3472, + "mean_token_accuracy": 0.8881478309631348, + "num_tokens": 894804911.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5977944135665894, + "learning_rate": 1e-06, + "loss": 0.3289, + "mean_token_accuracy": 0.8913822174072266, + "num_tokens": 894840056.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5905697345733643, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8730069398880005, + "num_tokens": 894881381.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7320383787155151, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8664186596870422, + "num_tokens": 894922806.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6740574836730957, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.868008553981781, + "num_tokens": 894960983.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5634299516677856, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8623730540275574, + "num_tokens": 895004842.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6922308206558228, + "learning_rate": 1e-06, + "loss": 0.3199, + "mean_token_accuracy": 0.8945376873016357, + "num_tokens": 895034602.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5278067588806152, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8688473701477051, + "num_tokens": 895076912.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5389372110366821, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8803734183311462, + "num_tokens": 895116446.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5837910175323486, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.863869845867157, + "num_tokens": 895156268.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5884895324707031, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8719913959503174, + "num_tokens": 895194255.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.63036048412323, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.866486132144928, + "num_tokens": 895232723.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5736949443817139, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8745340704917908, + "num_tokens": 895272475.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4648252725601196, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8839824199676514, + "num_tokens": 895315231.0, + "step": 23470 + }, + { + "epoch": 2.9857524487978626, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6323236227035522, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8818349242210388, + "num_tokens": 895350963.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.632910966873169, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8772380352020264, + "num_tokens": 895388499.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5326430797576904, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8747581243515015, + "num_tokens": 895429357.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5975664854049683, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.886210024356842, + "num_tokens": 895465414.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6776741743087769, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8767260313034058, + "num_tokens": 895500960.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7418110370635986, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8777530193328857, + "num_tokens": 895536126.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5614473819732666, + "learning_rate": 1e-06, + "loss": 0.3159, + "mean_token_accuracy": 0.8978167772293091, + "num_tokens": 895572905.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6968001127243042, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8693801164627075, + "num_tokens": 895608850.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6283509731292725, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8639137744903564, + "num_tokens": 895652107.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5824064016342163, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.883123517036438, + "num_tokens": 895691251.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5875447988510132, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8828508853912354, + "num_tokens": 895731732.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6547737121582031, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8775959014892578, + "num_tokens": 895768205.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7711379528045654, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8752723932266235, + "num_tokens": 895798916.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5382969379425049, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8756347894668579, + "num_tokens": 895843127.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.683498740196228, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8644979596138, + "num_tokens": 895884133.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.63491952419281, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8692376017570496, + "num_tokens": 895922814.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6251521110534668, + "learning_rate": 1e-06, + "loss": 0.3502, + "mean_token_accuracy": 0.8868759870529175, + "num_tokens": 895955052.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5613267421722412, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8748090267181396, + "num_tokens": 895997671.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.569446325302124, + "learning_rate": 1e-06, + "loss": 0.3477, + "mean_token_accuracy": 0.8884984850883484, + "num_tokens": 896036269.0, + "step": 23489 + }, + { + "epoch": 2.9881694440910826, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.4938182830810547, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.878534197807312, + "num_tokens": 896079018.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7058228254318237, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8732715845108032, + "num_tokens": 896114202.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5874531269073486, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8770904541015625, + "num_tokens": 896156122.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5332732200622559, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8803268671035767, + "num_tokens": 896198516.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6805049180984497, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8787311315536499, + "num_tokens": 896234752.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6696592569351196, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8814852237701416, + "num_tokens": 896269731.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6547913551330566, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8735412359237671, + "num_tokens": 896309029.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7320139408111572, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8563632965087891, + "num_tokens": 896348565.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5365527868270874, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8573722839355469, + "num_tokens": 896393526.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.627906084060669, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8787367939949036, + "num_tokens": 896429518.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.629608392715454, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.872410774230957, + "num_tokens": 896467538.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6351152658462524, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8783581256866455, + "num_tokens": 896504788.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6659191846847534, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8667283058166504, + "num_tokens": 896540391.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6120902299880981, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8757665157318115, + "num_tokens": 896578925.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.8099359273910522, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8646150827407837, + "num_tokens": 896614459.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.4667240381240845, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.8820765018463135, + "num_tokens": 896657330.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.540959119796753, + "learning_rate": 1e-06, + "loss": 0.3492, + "mean_token_accuracy": 0.883644700050354, + "num_tokens": 896699379.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6216278076171875, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8746026158332825, + "num_tokens": 896738670.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5662188529968262, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8682316541671753, + "num_tokens": 896779103.0, + "step": 23508 + }, + { + "epoch": 2.9905864393843022, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.4991363286972046, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.875866174697876, + "num_tokens": 896819682.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.525967001914978, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8699905872344971, + "num_tokens": 896861805.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.735616683959961, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8748455047607422, + "num_tokens": 896900384.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5910979509353638, + "learning_rate": 1e-06, + "loss": 0.3578, + "mean_token_accuracy": 0.8864014744758606, + "num_tokens": 896937903.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6322287321090698, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8685627579689026, + "num_tokens": 896973905.0, + "step": 23513 + }, + { + "epoch": 2.991222490777255, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5860086679458618, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8688251376152039, + "num_tokens": 897012192.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5564419031143188, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8713253736495972, + "num_tokens": 897055514.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6047285795211792, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8786916732788086, + "num_tokens": 897091461.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5457279682159424, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8804817199707031, + "num_tokens": 897129781.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6638702154159546, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8721065521240234, + "num_tokens": 897167842.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.590806484222412, + "learning_rate": 1e-06, + "loss": 0.3182, + "mean_token_accuracy": 0.8948574066162109, + "num_tokens": 897202109.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7608104944229126, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8724422454833984, + "num_tokens": 897232504.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5428171157836914, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8806325197219849, + "num_tokens": 897268494.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8268144130706787, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8749035596847534, + "num_tokens": 897298888.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4966285228729248, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8702243566513062, + "num_tokens": 897348225.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5898922681808472, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8848239183425903, + "num_tokens": 897387030.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6540420055389404, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.854259192943573, + "num_tokens": 897425775.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.8273158073425293, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.87458735704422, + "num_tokens": 897455552.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.712069034576416, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8779481649398804, + "num_tokens": 897494176.0, + "step": 23527 + }, + { + "epoch": 2.993003434677522, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5286024808883667, + "learning_rate": 1e-06, + "loss": 0.3421, + "mean_token_accuracy": 0.8899040818214417, + "num_tokens": 897532424.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.665334701538086, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8852269053459167, + "num_tokens": 897568731.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.708397626876831, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8685754537582397, + "num_tokens": 897608587.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.626080870628357, + "learning_rate": 1e-06, + "loss": 0.3355, + "mean_token_accuracy": 0.8919436931610107, + "num_tokens": 897644497.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6023496389389038, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8772441744804382, + "num_tokens": 897680583.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5241035223007202, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8859516382217407, + "num_tokens": 897718657.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6936200857162476, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8721218705177307, + "num_tokens": 897752789.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4809772968292236, + "learning_rate": 1e-06, + "loss": 0.3245, + "mean_token_accuracy": 0.894892692565918, + "num_tokens": 897791254.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6798760890960693, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8698703050613403, + "num_tokens": 897830905.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5990228652954102, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8810591697692871, + "num_tokens": 897870241.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6366890668869019, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8717039227485657, + "num_tokens": 897908775.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7013261318206787, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.867255687713623, + "num_tokens": 897949799.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4706624746322632, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8716174960136414, + "num_tokens": 897992798.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6182552576065063, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8709955215454102, + "num_tokens": 898031668.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.501253366470337, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8691535592079163, + "num_tokens": 898079828.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.616631269454956, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8785535097122192, + "num_tokens": 898116808.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4551235437393188, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8772246241569519, + "num_tokens": 898161890.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6185468435287476, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8717460632324219, + "num_tokens": 898198992.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6331056356430054, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8747804164886475, + "num_tokens": 898240714.0, + "step": 23546 + }, + { + "epoch": 2.995420429970742, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6886167526245117, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8692139387130737, + "num_tokens": 898281800.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6265374422073364, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8587534427642822, + "num_tokens": 898321373.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5151442289352417, + "learning_rate": 1e-06, + "loss": 0.359, + "mean_token_accuracy": 0.8830801844596863, + "num_tokens": 898361778.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6682714223861694, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.879576563835144, + "num_tokens": 898396146.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6448525190353394, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8584450483322144, + "num_tokens": 898437273.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5175918340682983, + "learning_rate": 1e-06, + "loss": 0.3318, + "mean_token_accuracy": 0.8932604193687439, + "num_tokens": 898478915.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.5822184085845947, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8762099742889404, + "num_tokens": 898517552.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.58460533618927, + "learning_rate": 1e-06, + "loss": 0.3411, + "mean_token_accuracy": 0.8900871276855469, + "num_tokens": 898557593.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.553310513496399, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8691791296005249, + "num_tokens": 898599579.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6983206272125244, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8774718046188354, + "num_tokens": 898638451.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5301991701126099, + "learning_rate": 1e-06, + "loss": 0.3484, + "mean_token_accuracy": 0.8867326974868774, + "num_tokens": 898677472.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.705545425415039, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8824604749679565, + "num_tokens": 898713318.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.5764096975326538, + "learning_rate": 1e-06, + "loss": 0.3548, + "mean_token_accuracy": 0.8852649331092834, + "num_tokens": 898750817.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.7038400173187256, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8614459037780762, + "num_tokens": 898789749.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.573438048362732, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8751985430717468, + "num_tokens": 898829893.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.7588632106781006, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8669945001602173, + "num_tokens": 898865119.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.8247450590133667, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8616356253623962, + "num_tokens": 898902611.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6159690618515015, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8710414171218872, + "num_tokens": 898940496.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6119179725646973, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8613568544387817, + "num_tokens": 898980142.0, + "step": 23565 + }, + { + "epoch": 2.9978374252639615, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6488213539123535, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8705661296844482, + "num_tokens": 899015594.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.656570315361023, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8835867643356323, + "num_tokens": 899050220.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6424285173416138, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8719257712364197, + "num_tokens": 899090583.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6178977489471436, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8789916038513184, + "num_tokens": 899128428.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.552540898323059, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.882099449634552, + "num_tokens": 899169376.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.6007156372070312, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8808380365371704, + "num_tokens": 899207292.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.65537691116333, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8609853982925415, + "num_tokens": 899246381.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 2.956390380859375e-05, + "grad_norm": 1.6904337406158447, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8805665373802185, + "num_tokens": 899285395.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 2.944469451904297e-05, + "grad_norm": 1.597167730331421, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8738960027694702, + "num_tokens": 899326740.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.4899317026138306, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8768998384475708, + "num_tokens": 899370037.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6631805896759033, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.872421383857727, + "num_tokens": 899407970.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7722605466842651, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8862493634223938, + "num_tokens": 899440035.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.568625569343567, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8714574575424194, + "num_tokens": 899479622.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.5742686986923218, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8820579648017883, + "num_tokens": 899521333.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.9202786684036255, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8643057346343994, + "num_tokens": 899554791.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6684644222259521, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8810577392578125, + "num_tokens": 899589234.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.6100927591323853, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8760053515434265, + "num_tokens": 899627746.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 2.968311309814453e-05, + "grad_norm": 1.7048763036727905, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8720717430114746, + "num_tokens": 899663974.0, + "step": 23583 + }, + { + "epoch": 3.0, + "ewc_loss": 2.968311309814453e-05, + "step": 23583, + "total_flos": 5.628159003328302e+19, + "train_loss": 0.4093159021503956, + "train_runtime": 46764.448, + "train_samples_per_second": 8.068, + "train_steps_per_second": 0.504 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.628159003328302e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..60fd48e --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e12e8f68c62f0f12542fd2504c9c5e58fa1e477bb114d3aeebf72b8fdce2f6f +size 13393